This commit is contained in:
Piotr Wilkin (ilintar) 2026-04-13 15:03:55 +02:00 committed by GitHub
commit e840352f97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
59 changed files with 10652 additions and 1724 deletions

View File

@ -111,13 +111,14 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t nrows; // number of rows to process simultaneously
size_t levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor)
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

View File

@ -429,7 +429,15 @@ extern "C" {
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
GGML_TYPE_Q1_0 = 41,
GGML_TYPE_COUNT = 42,
GGML_TYPE_Q3_PT = 42, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks
GGML_TYPE_Q3_KPT = 43, // Q3_K with learned per-tensor levels (3.4375 bpw)
GGML_TYPE_Q4_DPT = 44, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
GGML_TYPE_Q2_DPT = 45, // 2-bit with learned per-tensor int8 levels (2.5 bpw)
GGML_TYPE_Q2_KPT = 46, // Q2_K with learned per-tensor float levels (2.625 bpw)
GGML_TYPE_IQ2_TQ = 47, // Trellis quantized with RNG codebook (2.0625 bpw)
GGML_TYPE_IQ3_TQ = 48, // 3-bit with per-tensor trained grid table (3.5625 bpw)
GGML_TYPE_IQ1_BN = 49, // 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
GGML_TYPE_COUNT = 50,
};
// precision
@ -457,6 +465,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_PT = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
@ -465,8 +474,11 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_Q2_KPT = 29, // except 1d tensors
GGML_FTYPE_MOSTLY_NVFP4 = 30, // except 1d tensors
GGML_FTYPE_MOSTLY_Q1_0 = 31, // except 1d tensors
};
// available tensor operations:
@ -686,9 +698,8 @@ extern "C" {
char name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
void * extra; // extra things e.g. for ggml-cuda.cu
void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit)
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -2723,7 +2734,7 @@ extern "C" {
# define GGML_RESTRICT restrict
# endif
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
struct ggml_type_traits {
@ -2734,6 +2745,7 @@ extern "C" {
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float_ref;
size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor)
};
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

View File

@ -208,6 +208,13 @@ add_library(ggml-base
ggml-quants.h
gguf.cpp)
# Enable native SIMD for ggml-quants.c (needed for K-means training in quantization)
# NOTE(review): -march=native tunes the object to the *build host's* CPU, so the
# resulting binary is not portable and the flag is wrong under cross-compilation
# (check_c_compiler_flag only probes the host compiler). Consider gating this on
# the project's native-build option — confirm the intended distribution model.
include(CheckCCompilerFlag)
check_c_compiler_flag("-march=native" GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
if (GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
    set_source_files_properties(ggml-quants.c PROPERTIES COMPILE_FLAGS "-march=native")
endif()
set_target_properties(ggml-base PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}

View File

@ -396,7 +396,7 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
//
struct ggml_backend_meta_buffer_context {
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::quant_levels);
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;

View File

@ -1,5 +1,15 @@
#include "ggml-impl.h"
#include "ggml-blas.h"
// Helper: compute quant_levels stride for a given row.
// For Q2_KPT (per-block levels), stride depends on tensor width.
// Byte stride between consecutive rows of a tensor's quant_levels table.
// Most quant types share one level table for the whole tensor, so the
// type-traits constant stride (typically 0) is returned unchanged. Q2_KPT is
// the exception: it stores 4 float levels per 256-element block, so its stride
// grows with the row width ne0.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type != GGML_TYPE_Q2_KPT) {
        return constant_stride;
    }
    const size_t blocks_per_row = (size_t) (ne0 / 256);
    return blocks_per_row * 4 * sizeof(float);
}
#include "ggml-backend-impl.h"
#include <future>
@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]);
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
#else
for (int i = 1; i < n_threads; i++) {
@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
if (start < end) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}));
}
@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int64_t start = 0;
const int64_t end = ne01/n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}
#endif

View File

@ -298,6 +298,7 @@ typedef struct {
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
@ -327,6 +328,12 @@ typedef struct {
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
// Q3_KPT: Q3_K with learned per-tensor levels (3.4375 bpw, same size as Q3_K)
// Reuses the block_q3_K structure unchanged; only the decode differs — the
// 3-bit indices are mapped through a learned 8-entry level table instead of
// the fixed Q3_K values.
typedef block_q3_K block_q3_kpt;
#define Q3KPT_N_LEVELS 8 // entries in the per-tensor trained level table (3 bits -> 8)
// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
@ -449,6 +456,115 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
// 3.875 bpw - per-tensor Lloyd-Max scalar quantization
// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor
// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes
// (124 bytes * 8 bits / 256 weights = 3.875 bpw)
typedef struct {
    ggml_half d;               // 2 bytes: global scale for 16-elem sub-block ranges
    ggml_half dmin;            // 2 bytes: global scale for sub-block neg_mins
    uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins)
    uint8_t qs[3*QK_K/8];      // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential
} block_q3_pt;
static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size");
#define Q3PT_N_LEVELS 8 // entries in the per-tensor trained level table (3 bits -> 8)
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements);
// the 4-bit indices select from a 16-entry learned int8 table rather than the
// fixed IQ4_NL value set.
typedef block_iq4_nl block_q4_dpt;
#define Q4DPT_N_LEVELS 16 // entries in the per-tensor trained level table (4 bits -> 16)
// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw)
// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block
// (10 bytes * 8 bits / 32 weights = 2.5 bpw)
// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means
typedef struct {
    ggml_half d;   // 2 bytes: FP16 scale (delta)
    uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total)
} block_q2_dpt;
static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding");
#define QK2_DPT 32        // elements per block
#define Q2DPT_N_LEVELS 4  // entries in the per-tensor trained level table (2 bits -> 4)
// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw, same size as Q2_K)
// Reuses the block_q2_K structure but maps the 2-bit indices through a learned
// level table. Note: unlike the other *PT types, Q2_KPT's levels are stored per
// 256-element block (4 floats each), so its quant_levels row stride depends on
// the row width rather than being a per-tensor constant.
typedef block_q2_K block_q2_kpt;
#define Q2KPT_N_LEVELS 4 // float levels per 256-element block (2 bits -> 4)
// IQ2_TQ: 2-bit scalar quantization with per-tensor trained asymmetric grid table
// 32 groups of 8 elements per 256-element super-block
// Block layout (82 bytes per 256 elements = 2.5625 bpw):
// - ggml_half d (2 bytes): super-block scale
// - uint8_t scales[16] (16 bytes): 32 x 4-bit grid entry index per group
// - uint8_t qs[64] (64 bytes): 256 x 2-bit element index within grid entry
// recon[j] = d * IQ2TQ_GRID_SCALE * grid[group_idx][elem_idx]
//
// NOTE(review): the type's enum comment elsewhere still calls IQ2_TQ "Trellis
// quantized with RNG codebook (2.0625 bpw)" — that describes an older 66-byte
// layout, not the 82-byte grid-table struct below; confirm and update the
// enum comment.
typedef struct {
    ggml_half d;             // Super-block scale (2 bytes)
    uint8_t scales[QK_K/16]; // 32 x 4-bit grid entry index per group (16 bytes)
    uint8_t qs[QK_K/4];      // 256 x 2-bit element index (64 bytes)
} block_iq2_tq;
static_assert(sizeof(block_iq2_tq) == 82, "wrong iq2_tq block size");
// 2 + 16 + 64 = 82 bytes per 256 weights = 2.5625 bpw
#define IQ2TQ_GROUP_SIZE 8                       // Elements per group
#define IQ2TQ_N_GROUPS (QK_K / IQ2TQ_GROUP_SIZE) // 32 groups per super-block
#define IQ2TQ_GRID_SCALE 0.125f                  // Grid value multiplier: recon = d * GRID_SCALE * grid_int8
// IQ3_TQ: 3-bit scalar quantization with per-tensor trained asymmetric grid table (3.5625 bpw)
// 32 groups of 8 elements per 256-element super-block
// Each grid entry has 8 int8 levels (3 bits -> 8 values per element)
// Grid table: 16 entries x 8 int8 = 128 bytes per tensor
// Block layout:
// - ggml_half d (2 bytes): super-block scale
// - uint8_t scales[16] (16 bytes): 32 x 4-bit grid entry index per group
// - uint8_t qs[96] (96 bytes): 256 x 3-bit element index within grid entry
// recon[j] = d * IQ3TQ_GRID_SCALE * grid[group_idx][elem_idx]
typedef struct {
    ggml_half d;             // Super-block scale (2 bytes)
    uint8_t scales[QK_K/16]; // 32 x 4-bit grid entry index per group (16 bytes)
    uint8_t qs[3*QK_K/8];    // 256 x 3-bit element index (96 bytes)
} block_iq3_tq;
static_assert(sizeof(block_iq3_tq) == 114, "wrong iq3_tq block size");
// 2 + 16 + 96 = 114 bytes per 256 weights = 3.5625 bpw
#define IQ3TQ_GROUP_SIZE 8                       // Elements per group
#define IQ3TQ_N_GROUPS (QK_K / IQ3TQ_GROUP_SIZE) // 32 groups per super-block
#define IQ3TQ_N_LEVELS 8                         // 3-bit -> 8 levels per grid entry
#define IQ3TQ_GRID_SCALE 0.125f                  // Grid value multiplier
#define IQ3TQ_GRID_SIZE 128                      // 16 entries x 8 int8 = 128 bytes per tensor
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook (1.5625 bpw)
// 32 groups of 8 elements per 256-element super-block
// Each group selects one of 4096 trained 8D vectors via a 12-bit codebook index
// Codebook: 4096 entries x 8 int8 = 32768 bytes per tensor
// Block layout:
// - ggml_half d (2 bytes): super-block scale
// - uint8_t qs[48] (48 bytes): 32 x 12-bit codebook indices packed in pairs
// 12-bit pair packing (groups 2k, 2k+1 -> 3 bytes at qs[3k]), low nibble first:
//     idx_even = qs[3k]        | ((qs[3k+1] & 0x0F) << 8)
//     idx_odd  = (qs[3k+1] >> 4) | (qs[3k+2] << 4)
// recon[g*8+k] = d * IQ1BN_GRID_SCALE * codebook[ci][k]
typedef struct {
    ggml_half d;           // Super-block scale (2 bytes)
    uint8_t qs[3*QK_K/16]; // 32 x 12-bit codebook indices packed in pairs (48 bytes)
} block_iq1_bn;
static_assert(sizeof(block_iq1_bn) == 50, "wrong iq1_bn block size");
// 2 + 48 = 50 bytes per 256 weights = 1.5625 bpw
#define IQ1BN_GROUP_SIZE 8
#define IQ1BN_N_GROUPS (QK_K / IQ1BN_GROUP_SIZE)                 // 32
#define IQ1BN_CODEBOOK_K 4096                                    // number of codebook entries
#define IQ1BN_CODEBOOK_DIM 8                                     // vector dimension (= group size)
#define IQ1BN_GRID_SCALE 0.125f                                  // Grid value multiplier
#define IQ1BN_CODEBOOK_SIZE (IQ1BN_CODEBOOK_K * IQ1BN_CODEBOOK_DIM) // 32768 bytes
#define IQ1BN_AUX_SIZE IQ1BN_CODEBOOK_SIZE                       // 32768 bytes
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

View File

@ -33,6 +33,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -203,6 +205,15 @@
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
@ -307,6 +318,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0

View File

@ -137,7 +137,111 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Dot product of one Q1_0 row with a Q8_0 row: *s = sum_i x[i]*y[i].
// Q1_0 packs 128 signs per block (1 bit each; set bit decodes to +1, clear bit
// to -1) with a single fp16 scale, so each Q1_0 block lines up with exactly
// four Q8_0 blocks (4 x 32 = 128 elements). `levels` is unused for this type.
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    const int qk = QK1_0; // 128
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    GGML_UNUSED(levels); // Q1_0 has no per-tensor level table

    const block_q1_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float sumf = 0.0f;

#if defined(__ARM_NEON)
    float32x4_t sumv = vdupq_n_f32(0.0f);
    for (int i = 0; i < nb; i++) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
        // Process 4 Q8_0 blocks (each has 32 elements)
        for (int k = 0; k < 4; k++) {
            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            // Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
            // Bits are at offset k*4 bytes in x[i].qs
            const uint8_t * bits = &x[i].qs[k * 4];
            // Load 32 int8 values from y
            const int8x16_t y0 = vld1q_s8(yb->qs);
            const int8x16_t y1 = vld1q_s8(yb->qs + 16);
            // Expand each byte of packed bits to 8 per-lane flags via lookup.
            // Byte 0-1: bits for y0[0..15]
            const uint64_t expand0 = table_b2b_0[bits[0]];
            const uint64_t expand1 = table_b2b_0[bits[1]];
            // Byte 2-3: bits for y1[0..15]
            const uint64_t expand2 = table_b2b_0[bits[2]];
            const uint64_t expand3 = table_b2b_0[bits[3]];
            // Build the sign vectors by reinterpreting the table values
            uint8x8_t e0 = vcreate_u8(expand0);
            uint8x8_t e1 = vcreate_u8(expand1);
            uint8x8_t e2 = vcreate_u8(expand2);
            uint8x8_t e3 = vcreate_u8(expand3);
            // Shift right by 4 to get 0 or 1
            // (each expanded lane holds its bit at position 4)
            int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
            int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
            int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
            int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
            // Convert 0/1 to -1/+1: sign = 2*val - 1
            int8x8_t one = vdup_n_s8(1);
            s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1
            s1 = vsub_s8(vadd_s8(s1, s1), one);
            s2 = vsub_s8(vadd_s8(s2, s2), one);
            s3 = vsub_s8(vadd_s8(s3, s3), one);
            // Combine into 16-element vectors
            int8x16_t signs0 = vcombine_s8(s0, s1);
            int8x16_t signs1 = vcombine_s8(s2, s3);
            // Multiply signs with y values and accumulate
            // dot(signs, y) where signs are +1/-1
            int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
            int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
            // Scale by d1 and accumulate
            sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
        }
    }
    sumf = vaddvq_f32(sumv);
#else
    // Scalar fallback
    // NOTE(review): this path uses GGML_FP16_TO_FP32 while the NEON path uses
    // GGML_CPU_FP16_TO_FP32 — confirm both macros are in scope and equivalent.
    for (int i = 0; i < nb; i++) {
        const float d0 = GGML_FP16_TO_FP32(x[i].d);
        // Process 4 Q8_0 blocks
        for (int k = 0; k < 4; k++) {
            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
            int sumi = 0;
            for (int j = 0; j < QK8_0; j++) {
                // Bit `bit_index` of x[i].qs selects the sign of element j.
                const int bit_index = k * QK8_0 + j;
                const int byte_index = bit_index / 8;
                const int bit_offset = bit_index % 8;
                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                sumi += xi * y[i*4 + k].qs[j];
            }
            sumf += d0 * d1 * sumi;
        }
    }
#endif
    *s = sumf;
}
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK1_0; // 128
const int nb = n / qk;
@ -240,7 +344,7 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -533,7 +637,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -753,12 +857,13 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
GGML_UNUSED(levels);
assert(n % QK_NVFP4 == 0);
const block_nvfp4 * GGML_RESTRICT x = vx;
@ -837,7 +942,92 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Dot product of one NVFP4 row with a Q8_0 row.
// An NVFP4 super-block holds 64 elements (QK_NVFP4) split into four
// sub-blocks of QK_NVFP4_SUB elements, each with its own UE4M3 scale
// (x[ib].d[0..3]); the packed 4-bit element codes are decoded through the
// kvalues_mxfp4 table. One super-block spans two Q8_0 blocks.
// `levels` is unused for this type.
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    GGML_UNUSED(levels); // NVFP4 has no per-tensor level table

    assert(n % QK_NVFP4 == 0);

    const block_nvfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    // Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
    const int nb = n / QK_NVFP4;

    float sumf = 0;

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    float32x4_t acc = vdupq_n_f32(0.0f);
    for (int ib = 0; ib < nb; ++ib) {
        // Decode the 4-bit codes: low/high nibbles interleave element halves,
        // so q8 halves are recombined below to match the nibble lane order
        // (same layout as the scalar decode in the #else branch).
        const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
        const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
        const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
        const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
        const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
        const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
        const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
        const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
        const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
        const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
        const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
        const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
        const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
        const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
        // Integer dot products; p0/p1 lanes correspond to the 4 sub-blocks
        // after the pairwise add below.
        const int32x4_t p0 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
        const int32x4_t p1 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
        const int32x4_t sums = vpaddq_s32(p0, p1);
        // Decode 4 UE4M3 scales to f32 and multiply with q8 scales
        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
        const float32x4_t nvsc = {
            ggml_ue4m3_to_fp32(x[ib].d[0]),
            ggml_ue4m3_to_fp32(x[ib].d[1]),
            ggml_ue4m3_to_fp32(x[ib].d[2]),
            ggml_ue4m3_to_fp32(x[ib].d[3])
        };
        const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
        acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
    }
    sumf = vaddvq_f32(acc);
#else
    // Scalar fallback: iterate the 4 scaled sub-blocks of each super-block.
    for (int ib = 0; ib < nb; ++ib) {
        for (int si = 0; si < 4; ++si) {
            const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
            const int q8b = si / 2;                  // which of the two q8_0 blocks
            const int q8o = (si % 2) * QK_NVFP4_SUB; // element offset inside it
            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
            int sumi_lo = 0, sumi_hi = 0;
            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
                // Low nibble -> first half of the sub-block, high nibble -> second half.
                const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
                sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
                sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
            }
            sumf += dy * d * (sumi_lo + sumi_hi);
        }
    }
#endif
    *s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -949,7 +1139,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -1067,7 +1257,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -3953,6 +4143,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
// Q3_PT x Q8_K dot product: no architecture-specific kernel here, so forward
// to the generic implementation. The signature must match ggml_vec_dot_t,
// which now carries a trailing `levels` argument (per-tensor trained level
// table); it is forwarded so the generic kernel can decode the 3-bit indices.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);

View File

@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
// Q3_PT x Q8_K dot product: no SIMD kernel for this architecture, so defer to
// the generic implementation. Updated to the new ggml_vec_dot_t signature —
// the trailing `levels` pointer (per-tensor trained level table) must be
// accepted and passed through, as the other forwarders in this file do.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
#if defined(__loongarch_asx)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i a = __lasx_xvmulwev_h_b(x, y);

View File

@ -141,7 +141,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -207,11 +207,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -274,7 +274,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -340,11 +340,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -412,11 +412,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -488,11 +488,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -557,7 +557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2000,6 +2000,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
@ -2190,7 +2194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(nb);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

View File

@ -213,7 +213,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_0;
const int nb = n / qk;
@ -264,11 +264,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_1;
const int nb = n / qk;
@ -315,11 +315,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_0;
const int nb = n / qk;
@ -369,11 +369,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_1;
const int nb = n / qk;
@ -422,11 +422,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -470,7 +470,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2954,6 +2954,14 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
#if defined __riscv_v_intrinsic
static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);

View File

@ -146,7 +146,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -201,11 +201,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -258,7 +258,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -353,11 +353,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -495,11 +495,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -648,11 +648,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -698,7 +698,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1388,7 +1388,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(nb);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1463,3 +1463,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}

View File

@ -229,7 +229,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -355,7 +355,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -442,11 +442,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -537,11 +537,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -605,7 +605,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1218,3 +1218,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}

View File

@ -540,7 +540,8 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -698,7 +699,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -753,11 +755,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(ib);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -843,7 +846,8 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -919,11 +923,12 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -1005,11 +1010,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -1077,7 +1083,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1205,11 +1212,12 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1271,11 +1279,12 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1463,11 +1472,12 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1735,11 +1745,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1913,11 +1924,12 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(kmask2);
UNUSED(kmask3);
UNUSED(utmp);
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2123,11 +2135,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(kmask2);
UNUSED(kmask3);
UNUSED(utmp);
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2328,7 +2341,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2369,7 +2382,8 @@ static const int8_t keven_signs_q2xs[1024] = {
};
#endif
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2483,11 +2497,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2780,11 +2795,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2965,11 +2981,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3089,11 +3106,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3299,11 +3317,17 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3418,11 +3442,12 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3625,11 +3650,12 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(nb);
UNUSED(scale);
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -3713,7 +3739,185 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
*s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
assert(n % QK4_NL == 0);
static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
const block_q4_dpt * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
const int nb = n / QK4_NL;
const int8_t * values = (const int8_t *)levels;
GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");
int ib = 0;
float sumf = 0;
#if defined __AVX2__
const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
const __m128i m4b = _mm_set1_epi8(0x0f);
const __m256i mone = _mm256_set1_epi16(1);
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
_mm256_cvtepi32_ps(p_1), accum1);
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
_mm256_cvtepi32_ps(p_2), accum2);
}
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
#elif defined __AVX__
const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
const __m128i m4b = _mm_set1_epi8(0x0f);
__m256 accum = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
}
sumf = hsum_float_8(accum);
#endif
for (; ib < nb; ++ib) {
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * values[x[ib].qs[j] & 0xf];
sumi2 += y[ib].qs[j+QK4_NL/2] * values[x[ib].qs[j] >> 4];
}
sumf += d * (sumi1 + sumi2);
}
*s = sumf;
}
void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    // Dot product of one Q2_DPT row (2-bit indices into a learned 4-entry
    // int8 level table) with one Q8_0 row.
    //   s      - output: the scalar dot product over n elements
    //   vx     - Q2_DPT blocks: fp16 scale d + QK2_DPT/4 bytes of packed 2-bit indices
    //   vy     - Q8_0 blocks:   fp16 scale d + QK8_0 int8 values
    //   levels - per-tensor table of 4 int8 quantization levels (must be non-NULL)
    // bs/bx/by are unused strides; only nrc == 1 is supported.
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");
    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;
    const int nb = n / QK2_DPT;
    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");
    float sumf = 0;
    // NOTE(review): the previous AVX2 path was removed because it did not match
    // the scalar reference below:
    //  - it loaded 16 bytes from x[ib].qs, while the scalar loop only ever reads
    //    QK2_DPT/4 = 8 bytes of packed indices per block (so the vector load
    //    read past the qs data, out of bounds on the last block pair), and
    //  - after splitting the q8 vectors into even/odd bytes, the pairwise byte
    //    products only ever picked up even-indexed qs bytes, so half of the
    //    quantized data never contributed to the result.
    // Until a verified SIMD kernel exists, the scalar loop is the single code
    // path and serves as the layout reference: byte j of qs holds elements
    // 4j..4j+3 in bit pairs 0-1, 2-3, 4-5, 6-7.
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi = 0;
        for (int j = 0; j < QK2_DPT/4; ++j) {
            const uint8_t q = x[ib].qs[j];
            sumi += y[ib].qs[j*4 + 0] * values[(q >> 0) & 3];
            sumi += y[ib].qs[j*4 + 1] * values[(q >> 2) & 3];
            sumi += y[ib].qs[j*4 + 2] * values[(q >> 4) & 3];
            sumi += y[ib].qs[j*4 + 3] * values[(q >> 6) & 3];
        }
        sumf += d * sumi;
    }
    *s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -3815,6 +4019,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

View File

@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-quants.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-impl.h"
@ -396,6 +397,52 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q3_PT] = {
// from_float not set — requires codebook initialization via q3pt_set_codebook()
.vec_dot = ggml_vec_dot_q3_pt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q3_KPT] = {
// from_float not set — requires level initialization via q3kpt_set_levels()
.vec_dot = ggml_vec_dot_q3_kpt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q4_DPT] = {
// from_float not set — requires level initialization via q4dpt_set_levels()
.vec_dot = ggml_vec_dot_q4_dpt_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q2_DPT] = {
// from_float not set — requires level initialization via q2dpt_set_levels()
.vec_dot = ggml_vec_dot_q2_dpt_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q2_KPT] = {
// from_float not set — requires level initialization via q2kpt_set_levels()
.vec_dot = ggml_vec_dot_q2_kpt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.levels_row_stride = 0, // computed dynamically: (ne0/QK_K)*Q2KPT_N_LEVELS*sizeof(float)
},
[GGML_TYPE_IQ2_TQ] = {
.vec_dot = ggml_vec_dot_iq2_tq_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ3_TQ] = {
.vec_dot = ggml_vec_dot_iq3_tq_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ1_BN] = {
.vec_dot = ggml_vec_dot_iq1_bn_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
@ -1165,8 +1212,15 @@ static void ggml_compute_forward_mul_mat_one_chunk(
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
// ne00 is the number of elements per row in src0 (input dimension), NOT ne0 (= ne01 = output rows).
// For non-square matrices (e.g. ffn_up: [hidden, intermediate]) ne00 != ne01, so ne00 is correct.
// For other types, use the static stride from type_traits_cpu
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
: type_traits_cpu[type].levels_row_stride;
// broadcast factors
const int64_t r2 = ne12 / ne02;
@ -1227,7 +1281,11 @@ static void ggml_compute_forward_mul_mat_one_chunk(
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
// For Q2_KPT, levels are stored per-expert: [expert0_rows, expert1_rows, ...]
// So for 3D tensors we need to index by (i03 * ne01 + ir0)
const size_t levels_row_idx = (type == GGML_TYPE_Q2_KPT && ne03 > 1) ? (i03 * ne01 + ir0) : ir0;
const void * row_levels = (const char*)src0->quant_levels + levels_row_idx * levels_row_stride;
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot, row_levels);
}
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
@ -1293,7 +1351,8 @@ void ggml_compute_forward_mul_mat(
nb1/ggml_type_size(dst->type),
src0->type,
src1->type,
dst->type))
dst->type,
src0->quant_levels))
goto UseGgmlGemm1;
return;
}
@ -1361,7 +1420,8 @@ UseGgmlGemm1:;
nb1/ggml_type_size(dst->type),
src0->type,
vec_dot_type,
dst->type))
dst->type,
src0->quant_levels))
goto UseGgmlGemm2;
return;
}
@ -1461,8 +1521,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
const enum ggml_type type = src0->type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
// ne00 is the input dimension (elements per row in src0), NOT ne0 (= ne01 = output rows).
// For other types, use the static stride from type_traits_cpu
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
: type_traits_cpu[type].levels_row_stride;
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
@ -1495,7 +1561,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
const void * row_levels = (const char*)src0->quant_levels + (cur_a * ne01 + ir0) * levels_row_stride;
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, row_levels);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));

View File

@ -1356,16 +1356,20 @@ class tinyBLAS_Q0_AVX {
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
int ith, int nth,
const int8_t * custom_table = nullptr)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
const int8_t kvalues_iq4nl[16] = {
-127, -104, -83, -65,
-49, -35, -22, -10,
1, 13, 25, 38,
53, 69, 89, 113
};
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
if (custom_table) {
iq4nlt = _mm_loadu_si128((const __m128i *)custom_table);
} else {
const int8_t kvalues_iq4nl[16] = {
-127, -104, -83, -65,
-49, -35, -22, -10,
1, 13, 25, 38,
53, 69, 89, 113
};
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}
}
void matmul(int64_t m, int64_t n) {
@ -3684,7 +3688,7 @@ class tinyBLAS_PPC {
*/
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
int64_t ldc, int Atype, int Btype, int Ctype, const void * quant_levels) {
assert(m >= 0);
assert(n >= 0);
@ -4024,6 +4028,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
#endif
}
case GGML_TYPE_Q4_DPT: {
if (Btype != GGML_TYPE_Q8_0)
return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
// Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl)
// but uses a per-tensor lookup table instead of the fixed IQ4_NL values.
const int8_t * levels = (const int8_t *)quant_levels;
if (!levels) return false;
tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
params->ith, params->nth, levels};
tb.matmul(m, n);
return true;
#else
return false;
#endif
}
default:
return false;
}

View File

@ -18,7 +18,7 @@ extern "C" {
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
int, int, int, const void * quant_levels);
#ifdef __cplusplus
}

View File

@ -8,6 +8,19 @@
#include "unary-ops.h"
#include "vec.h"
// Helper: per-row stride (in bytes) into a tensor's quant_levels buffer.
// Most quant types use the fixed levels_row_stride from the CPU type traits;
// Q2_KPT keeps per-block level tables, so its stride grows with the row width.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type != GGML_TYPE_Q2_KPT) {
        return constant_stride;
    }
    // Q2_KPT: 4 level floats per 256-element block,
    // i.e. stride = (ne0 / 256) * 4 * sizeof(float)
    const size_t n_blocks = (size_t)(ne0 / 256);
    return n_blocks * 4 * sizeof(float);
}
#include <algorithm>
#include <cfloat>
#include <cmath>
@ -517,9 +530,11 @@ static void ggml_compute_forward_dup_from_q(
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
const size_t q_lrs0 = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(
(const void *) ((char *) src0->data + x_offset),
(float *) ((char *) dst->data + dst_offset), qk);
(float *) ((char *) dst->data + dst_offset), qk,
(const char*)src0->quant_levels + i01 * q_lrs0);
}
}
@ -639,7 +654,8 @@ static void ggml_compute_forward_add_q_f32(
assert(ne00 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne00);
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
// add src1
ggml_vec_acc_f32(ne00, wdata, src1_row);
// quantize row to dst
@ -688,6 +704,9 @@ void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@ -974,7 +993,8 @@ static void ggml_compute_forward_add1_q_f32(
assert(ne0 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne0);
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
// add src1
ggml_vec_acc1_f32(ne0, wdata, v);
// quantize row to dst
@ -1139,6 +1159,9 @@ void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@ -1269,6 +1292,9 @@ void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
default:
{
GGML_ABORT("fatal error");
@ -4321,7 +4347,8 @@ static void ggml_compute_forward_out_prod_q_f32(
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
dequantize_row_q(s0, wdata, ne0);
const size_t q_lrs_op = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(s0, wdata, ne0, (const char*)src0->quant_levels + i01 * q_lrs_op);
ggml_vec_mad_f32(ne0, d, wdata, *s1);
}
}
@ -4358,6 +4385,9 @@ void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
@ -4635,6 +4665,9 @@ void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
default:
{
GGML_ABORT("fatal error");
@ -4698,9 +4731,21 @@ static void ggml_compute_forward_get_rows_q(
GGML_ASSERT(i01 >= 0 && i01 < ne01);
const size_t q_lrs_gr = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
// For Q2_KPT with 3D tensors, levels are indexed by [i12 * ne02 * ne01 + i11 * ne01 + i01]
// For 2D tensors, levels are indexed by [i11 * ne01 + i01] (or just [i01] if ne02 == 1)
size_t levels_row_idx;
if (type == GGML_TYPE_Q2_KPT && ne03 > 1) {
levels_row_idx = (i12 * ne02 + i11) * ne01 + i01;
} else if (type == GGML_TYPE_Q2_KPT) {
levels_row_idx = i11 * ne01 + i01;
} else {
levels_row_idx = i01;
}
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc,
(const char*)src0->quant_levels + levels_row_idx * q_lrs_gr);
}
}
@ -4859,6 +4904,9 @@ void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@ -5436,7 +5484,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
// linear runtime, no additional memory
float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1, nullptr);
ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y);
@ -5571,6 +5619,8 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q3_KPT:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
@ -5583,6 +5633,12 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_Q3_PT:
case GGML_TYPE_Q2_KPT:
case GGML_TYPE_Q2_DPT:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
@ -6007,7 +6063,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
float v = 0;
ggml_vec_dot_f16(ne02, &v, 0,
(ggml_fp16_t *) wdata_src + i1n, 0,
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1, nullptr);
dst_data[i10*s0 + i00] += v;
}
}
@ -6095,7 +6151,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
float v = 0;
ggml_vec_dot_f32(ne02, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i00*ne02, 0, 1);
wdata_kernel + i00*ne02, 0, 1, nullptr);
dst_data[i10*s0 + i00] += v;
}
}
@ -7021,11 +7077,11 @@ static void ggml_compute_forward_conv_transpose_2d_impl(
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
ggml_vec_dot_f16(ne03, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
} else {
ggml_vec_dot_f32(ne03, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
}
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
}
@ -8298,7 +8354,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
float s; // KQ value
const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1, k->quant_levels);
s = s*scale; // scale KQ value
@ -8345,7 +8401,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
// V += v*expf(s - M)
if (v_to_float) {
v_to_float(v_data, V32, DV);
v_to_float(v_data, V32, DV, v->quant_levels);
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
} else {
// V is F32
@ -9058,7 +9114,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
ggml_vec_dot_f32(neq0,
S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1, nullptr);
}
// scale
@ -9172,7 +9228,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
// S = SM * (S - dot(SM, S))
float dot_SM_gradSM = 0;
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1, nullptr);
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
ggml_vec_mul_f32 (masked_begin, S, S, SM);
@ -10535,7 +10591,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
// delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
for (int64_t j = 0; j < S_v; ++j) {
float sum = 0.0f;
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1, nullptr);
delta[j] = (v_d[j] - sum) * beta_val;
}
@ -10547,7 +10603,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
// attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
for (int64_t j = 0; j < S_v; ++j) {
float sum = 0.0f;
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1, nullptr);
attn_data[j] = sum * scale;
}

View File

@ -120,7 +120,8 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
//===================================== Dot products =================================
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK1_0;
const int nb = n / qk;
@ -165,7 +166,8 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -202,7 +204,8 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
// TODO: add WASM SIMD
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -238,7 +241,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -270,7 +274,8 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -305,7 +310,8 @@ void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -348,7 +354,8 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -391,7 +398,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -421,7 +429,8 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -473,7 +482,8 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -505,7 +515,8 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -557,7 +568,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -636,7 +648,8 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -710,8 +723,7 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) sumf += sums[l];
*s = sumf;
}
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -741,6 +753,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumf = 0;
for (int i = 0; i < nb; ++i) {
GGML_UNUSED(levels);
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
const uint8_t * GGML_RESTRICT hm = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -791,7 +804,8 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -846,7 +860,8 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -888,7 +903,8 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -938,7 +954,8 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -990,7 +1007,8 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1034,7 +1052,8 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
*s = 0.25f * sumf;
}
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1090,7 +1109,65 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Q3_PT dot product (generic reference implementation).
// x      : Q3_PT blocks — 3-bit codes indexing an 8-entry per-tensor level
//          table, with per-16-element sub-blocks carrying a 6-bit range scale
//          and a 6-bit negated-min scale.
// y      : Q8_K blocks (int8 quants, per-block scale d, 16-element bsums).
// levels : required per-tensor float table; each 3-bit code q dequantizes as
//          lv[q] scaled by the sub-block range.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q3_pt * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q3_PT levels not set for tensor");

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float xd    = GGML_CPU_FP16_TO_FP32(x[i].d);
        const float xdmin = GGML_CPU_FP16_TO_FP32(x[i].dmin);
        const float yd    = y[i].d;

        const uint8_t * sc = x[i].scales;
        const uint8_t * qs = x[i].qs;
        const int8_t  * q8 = y[i].qs;

        float block_sum = 0.f;
        for (int ib = 0; ib < QK_K/16; ++ib) {
            // Inline 6-bit unpack for range scale (index ib) and neg_min scale
            // (index ib + QK_K/16); a 6-bit field starting at bit offset > 2
            // straddles two bytes.
            const int sbit0 = ib * 6, sbyte0 = sbit0 / 8, soff0 = sbit0 % 8;
            const int sbit1 = (ib + QK_K/16) * 6, sbyte1 = sbit1 / 8, soff1 = sbit1 % 8;
            uint8_t qrange = (sc[sbyte0] >> soff0) & 0x3F;
            if (soff0 > 2) { qrange |= (uint8_t)((sc[sbyte0+1] << (8 - soff0)) & 0x3F); }
            uint8_t qnmin = (sc[sbyte1] >> soff1) & 0x3F;
            if (soff1 > 2) { qnmin |= (uint8_t)((sc[sbyte1+1] << (8 - soff1)) & 0x3F); }
            const float range   = xd * (float)qrange;
            const float sub_min = -xdmin * (float)qnmin;

            float sum_lq = 0.f;
            for (int j = 0; j < 16; ++j) {
                // Inline 3-bit unpack; a field starting at offset > 5 straddles
                // two bytes.
                const int qk = ib * 16 + j;
                const int qbit = qk * 3;
                const int qbyte = qbit / 8;
                const int qoff = qbit % 8;
                int q = (qs[qbyte] >> qoff) & 0x7;
                if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); }
                sum_lq += lv[q] * (float)q8[qk];
            }
            // min contribution uses precomputed 16-element sum from block_q8_K.bsums
            block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib];
        }
        sumf += block_sum * yd;
    }
    *s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1133,7 +1210,375 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Q3_KPT vec_dot - similar to Q3_K but with learned levels.
// x      : Q3_K-layout blocks (2-bit low quants in qs, high bit in hmask,
//          16 packed 6-bit sub-block scales) where the combined 3-bit code
//          indexes the per-tensor float table `levels`.
// levels : required; lv[k] is mapped via lv[k]*7 - 4 into Q3_K's value range
//          (presumably levels are learned in [0,1] — confirm against the
//          quantizer).
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below).
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q3_kpt * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor");

    // Masks for unpacking the 12-byte packed 6-bit scales (same as Q3_K).
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
        const float yd    = y[i].d;

        const uint8_t * q  = x[i].qs;
        const uint8_t * hm = x[i].hmask;
        const int8_t  * q8 = y[i].qs;

        uint8_t m = 1; // walking bit selecting the current high-bit plane in hmask

        // Unpack 16 6-bit scales from the packed 12-byte scales field.
        uint32_t aux32[4];
        memcpy(aux32, x[i].scales, 12);
        uint32_t tmp = aux32[2];
        aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        const uint8_t * aux = (const uint8_t *)aux32;

        int is = 0;
        float block_sum = 0.f;
        for (int blk = 0; blk < QK_K; blk += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                int sc1 = (int)aux[is]   - 32; // scales are stored with a +32 bias
                int sc2 = (int)aux[is+1] - 32;
                is += 2;
                float dl1 = d_all * sc1;
                float dl2 = d_all * sc2;
                float sum1 = 0.f, sum2 = 0.f;
                for (int l = 0; l < 16; ++l) {
                    // 3-bit code = 2 low bits from qs + 1 high bit from hmask
                    int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0);
                    sum1 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+0];
                }
                for (int l = 0; l < 16; ++l) {
                    int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 4 : 0);
                    sum2 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+16];
                }
                block_sum += dl1 * sum1 + dl2 * sum2;
                shift += 2;
                m <<= 1;
                q8 += 32;
            }
            q += 32;
        }
        sumf += block_sum * yd;
    }
    *s = sumf;
}
// Dispatch entry point for the Q3_KPT x Q8_K dot product; forwards directly
// to the generic scalar implementation (no architecture-specific variant is
// selected here).
void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
// Q2_KPT vec_dot - similar to Q2_K but with learned levels.
// x      : Q2_K-layout blocks (2-bit quants, per-16-element scale/min nibbles
//          in `scales`, fp16 super-scales d/dmin).
// y      : Q8_K blocks (int8 quants, scale d, 16-element bsums).
// levels : required; Q2KPT_N_LEVELS floats *per QK_K block* — the caller must
//          pass a pointer already offset for this row (see levels_row_stride).
// NOTE: kept byte-identical; float accumulation order (per-16 partial sums
// folded into fsum) is part of the numeric behavior.
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q2_kpt * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;
    const int nb = n / QK_K;
    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor");
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // Per-block levels: block i uses lv[i*Q2KPT_N_LEVELS + 0..Q2KPT_N_LEVELS-1]
        // (the "i*4" layout assumes Q2KPT_N_LEVELS == 4).
        const float * block_lv = lv + i * Q2KPT_N_LEVELS;
        // Precompute mapped levels for this block: ml[k] = levels[k] * 3.0
        // (3.0 matches the 2-bit code range 0..3 — confirm against the quantizer).
        float ml[Q2KPT_N_LEVELS];
        for (int k = 0; k < Q2KPT_N_LEVELS; ++k) {
            ml[k] = block_lv[k] * 3.0f;
        }
        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;
        // Min term: accumulate integer bsums * min_scale (same as Q2_K);
        // the min nibble sits in the high 4 bits of each scales byte.
        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }
        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
        // Scale term: need floating-point because levels are non-uniform
        int is = 0;
        float fsum = 0;
        // Each 128-element half is stored as 32 packed bytes; `shift` walks the
        // four 2-bit planes within each byte.
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                int d_sc = sc[is++] & 0xF; // low nibble = sub-block scale
                float suml = 0;
                for (int l = 0; l < 16; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;
                d_sc = sc[is++] & 0xF;
                suml = 0;
                for (int l = 16; l < 32; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * fsum - dmin * summs;
    }
    *s = sumf;
}
// Dispatch entry point for the Q2_KPT x Q8_K dot product; forwards directly
// to the generic scalar implementation (no architecture-specific variant is
// selected here).
void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q2_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
// IQ2_TQ: 2-bit with asymmetric 4-tuple grid per group
// Default grid table — only used when no per-tensor grid is available
// (ggml_vec_dot_iq2_tq_q8_K falls back to this when `levels` is NULL).
// 16 entries of 4 signed dequantization values each; each 8-element group
// picks one entry via its 4-bit scale index.
static const int8_t iq2tq_grid_cpu[16][4] = {
    {-20, -8, -2, 6}, {-14, -8, -2, 4}, {-16,-10, 0, 12}, {-14, -4, 2, 8},
    {-20, -4, 4, 12}, {-8, -4, 0, 4}, {-8, -4, 0, 8}, {-12, -6, 2, 12},
    {-4, -2, 2, 4}, {-10, -2, 4, 8}, {-16, -6, 4, 20}, {-12, -2, 6, 14},
    {-8, -2, 4, 14}, {-4, 0, 4, 8}, {-8, -2, 6, 22}, {-4, 2, 8, 14},
};
// IQ2_TQ dot product: each 8-element group selects one of 16 signed 4-value
// tuples via a 4-bit index stored in `scales`; `levels`, when non-NULL,
// supplies a per-tensor trained grid overriding the built-in table.
void ggml_vec_dot_iq2_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // Fall back to the compiled-in grid when no per-tensor grid was provided.
    const int8_t (*entries)[4] = levels != NULL ? (const int8_t (*)[4])levels
                                                : (const int8_t (*)[4])iq2tq_grid_cpu;

    const block_iq2_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;

    const int nblocks = n / QK_K;

    float total = 0;
    for (int ib = 0; ib < nblocks; ++ib) {
        const float dx = GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ2TQ_GRID_SCALE;
        const int8_t * q8 = yb[ib].qs;

        int32_t acc = 0; // pure integer accumulation over the whole block
        for (int grp = 0; grp < IQ2TQ_N_GROUPS; ++grp) {
            // Two 4-bit tuple selectors per scales byte: low nibble = even group.
            const int sel = (xb[ib].scales[grp >> 1] >> ((grp & 1) ? 4 : 0)) & 0xF;
            const int8_t * tuple = entries[sel];
            // One group is 8 2-bit codes = 16 bits = 2 consecutive qs bytes.
            const uint16_t packed = (uint16_t)(xb[ib].qs[2*grp] | (xb[ib].qs[2*grp + 1] << 8));
            for (int k = 0; k < 8; ++k) {
                const int code = (packed >> (2*k)) & 3;
                acc += (int32_t)tuple[code] * (int32_t)q8[grp*8 + k];
            }
        }
        total += dx * yb[ib].d * (float)acc;
    }
    *s = total;
}
// IQ3_TQ default grid (must match ggml-quants.c)
// 16 entries of 8 signed dequantization values each; used as the fallback by
// ggml_vec_dot_iq3_tq_q8_K when no per-tensor trained grid (`levels`) is set.
static const int8_t iq3tq_grid_cpu[16][8] = {
    {-24,-18,-12, -6, 0, 6, 12, 18},
    {-20,-15,-10, -5, 0, 5, 10, 15},
    {-16,-12, -8, -4, 0, 4, 8, 12},
    {-12, -8, -4, -2, 0, 2, 4, 8},
    {-24,-16, -8, -2, 2, 6, 10, 14},
    {-14,-10, -6, -2, 2, 8, 16, 24},
    {-20,-14, -8, -4, 0, 4, 10, 18},
    {-18,-10, -4, 0, 4, 8, 14, 20},
    { -8, -6, -4, -2, 0, 2, 4, 6},
    {-10, -6, -4, -2, 2, 4, 6, 10},
    {-22,-14, -6, -2, 2, 6, 14, 22},
    {-16, -8, -4, -2, 0, 4, 8, 16},
    {-24,-20,-16,-12, -8, -4, 0, 4},
    { -4, 0, 4, 8, 12, 16, 20, 24},
    {-20,-16,-10, -4, 4, 10, 16, 20},
    {-12, -8, -6, -2, 2, 6, 8, 12},
};
// IQ3_TQ dot product: each 8-element group selects one of 16 signed 8-value
// grid rows via a 4-bit index in `scales` and stores eight 3-bit codes;
// `levels`, when non-NULL, supplies a per-tensor trained grid overriding the
// built-in table.
void ggml_vec_dot_iq3_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // Fall back to the compiled-in grid when no per-tensor grid was provided.
    const int8_t (*entries)[8] = levels != NULL ? (const int8_t (*)[8])levels
                                                : (const int8_t (*)[8])iq3tq_grid_cpu;

    const block_iq3_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;

    const int nblocks = n / QK_K;

    float total = 0;
    for (int ib = 0; ib < nblocks; ++ib) {
        const float dx = GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ3TQ_GRID_SCALE;
        const int8_t * q8 = yb[ib].qs;

        int32_t acc = 0; // pure integer accumulation over the whole block
        for (int grp = 0; grp < IQ3TQ_N_GROUPS; ++grp) {
            // Two 4-bit grid selectors per scales byte: low nibble = even group.
            const int sel = (xb[ib].scales[grp >> 1] >> ((grp & 1) ? 4 : 0)) & 0xF;
            const int8_t * row = entries[sel];
            // One group is 8 3-bit codes = 24 bits = 3 consecutive qs bytes.
            const uint32_t packed = (uint32_t)xb[ib].qs[3*grp]
                                  | ((uint32_t)xb[ib].qs[3*grp + 1] << 8)
                                  | ((uint32_t)xb[ib].qs[3*grp + 2] << 16);
            for (int k = 0; k < 8; ++k) {
                const int code = (packed >> (3*k)) & 7;
                acc += (int32_t)row[code] * (int32_t)q8[grp*8 + k];
            }
        }
        total += dx * yb[ib].d * (float)acc;
    }
    *s = total;
}
// IQ1_BN: 8D vector quantized dot product using a per-tensor codebook passed
// via `levels`. Each group of IQ1BN_GROUP_SIZE elements stores a 12-bit
// codebook index; two indices are packed into 3 bytes (even group: one byte
// plus the low nibble of the middle byte, odd group: the high nibble of the
// middle byte plus the following byte).
// NOTE(review): this was previously described as codebook[256][8] +
// scale_table[16], but the unpack below reads 12-bit indices (up to 4096
// rows) and no scale table is consulted here — confirm the intended codebook
// size against the quantizer.
void ggml_vec_dot_iq1_bn_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
    // The codebook is mandatory: the packed indices mean nothing without it.
    GGML_ASSERT(levels && "IQ1_BN requires per-tensor codebook in quant_levels");
    const int8_t * codebook = (const int8_t *)levels;
    const block_iq1_bn * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;
    const int nb = n / QK_K;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ1BN_GRID_SCALE;
        const float yd = y[i].d;
        const int8_t * q8 = y[i].qs;
        int32_t block_sum = 0;
        for (int g = 0; g < IQ1BN_N_GROUPS; ++g) {
            // 12-bit index unpack: index pair (2m, 2m+1) lives in bytes 3m..3m+2.
            int ci = (g & 1)
                ? ((x[i].qs[3*(g/2)+1] >> 4) | ((int)x[i].qs[3*(g/2)+2] << 4))
                : (x[i].qs[3*(g/2)] | (((int)x[i].qs[3*(g/2)+1] & 0x0F) << 8));
            const int8_t * cb = codebook + ci * IQ1BN_CODEBOOK_DIM;
            const int8_t * q8g = q8 + g * IQ1BN_GROUP_SIZE;
            for (int k = 0; k < IQ1BN_CODEBOOK_DIM; ++k) {
                block_sum += (int32_t)cb[k] * (int32_t)q8g[k];
            }
        }
        sumf += d * yd * (float)block_sum;
    }
    *s = sumf;
}
// Q4_DPT dot product: IQ4_NL-style 4-bit blocks whose 16 dequantization
// values come from the per-tensor int8 table `levels` instead of the fixed
// IQ4_NL lookup.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_q4_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        // Low nibbles hold the first half of the block, high nibbles the second.
        for (int j = 0; j < QK4_NL/2; ++j) {
            blk += (int32_t)y[ib].qs[j+ 0]       * (int32_t)values[x[ib].qs[j] & 0xf];
            blk += (int32_t)y[ib].qs[j+QK4_NL/2] * (int32_t)values[x[ib].qs[j] >>  4];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
// Q2_DPT dot product: 2-bit blocks whose 4 dequantization values come from
// the per-tensor int8 table `levels`.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");

    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK2_DPT;

    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        // Four consecutive 2-bit codes per packed byte, lowest bits first.
        for (int j = 0; j < QK2_DPT/4; ++j) {
            uint8_t q = x[ib].qs[j];
            blk += (int32_t)y[ib].qs[j*4 + 0] * (int32_t)values[(q >> 0) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 1] * (int32_t)values[(q >> 2) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 2] * (int32_t)values[(q >> 4) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 3] * (int32_t)values[(q >> 6) & 3];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1194,7 +1639,8 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1223,7 +1669,8 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);

View File

@ -37,66 +37,79 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_bn_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
// Generic implementation
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
#ifdef __cplusplus
}

View File

@ -8,7 +8,8 @@ ggml_fp16_t ggml_table_gelu_f16[1 << 16];
// precomputed quick gelu table for f16 (128 KB)
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
@ -136,7 +137,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
*s = sumf;
}
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
@ -261,7 +263,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
*s = sumf;
}
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);

View File

@ -39,9 +39,9 @@ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
// fundamental operations
//
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
@ -873,7 +873,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
}
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
// L2 norm of x: s = sqrt(dot(x, x)). The trailing NULL is the (unused for f32)
// per-tensor quant-levels argument of ggml_vec_dot_f32.
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1, NULL); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {

View File

@ -1057,6 +1057,27 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
static constexpr int qi = QI4_NL;
};
// CUDA type traits for Q4_DPT: reuses the IQ4_NL constants, since Q4_DPT is
// addressed with the same block geometry (QK4_NL elements per block) and only
// the nibble -> value mapping differs (per-tensor table q4dpt_levels_cuda).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_DPT> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};
// Per-tensor lookup table for Q4_DPT (device global memory).
// Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use.
__device__ int8_t q4dpt_levels_cuda[16];
// Per-tensor lookup table for Q2_DPT (4 int8 levels).
__device__ int8_t q2dpt_levels_cuda[4];
// CUDA type traits for Q2_DPT (2-bit codes, per-tensor int8 level table).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_DPT> {
    static constexpr int qk = QK2_DPT;
    static constexpr int qr = 4; // 4 elements per "quantum" (2-bit)
    static constexpr int qi = 1; // 1 uint32 per block
                                 // NOTE(review): if QK2_DPT is 32 (consistent with
                                 // the 2.5 bpw type description), a block holds
                                 // 8 quant bytes = 2 uint32 — confirm qi = 1 is
                                 // intended and not 2.
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qk = QK_K;
@ -1064,6 +1085,38 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qi = QI4_XS;
};
// Per-tensor grid for IQ2_TQ (16 × 4 int8 = 64 bytes).
__device__ int8_t iq2tq_grid_cuda[64];
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_TQ> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
// Per-tensor grid for IQ3_TQ (16 × 8 int8 = 128 bytes).
__device__ int8_t iq3tq_grid_cuda[128];
// Per-tensor codebook for IQ1_BN (4096 × 8 int8 = 32768 bytes).
__device__ int8_t iq1bn_codebook_cuda[IQ1BN_CODEBOOK_SIZE];
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_TQ> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qk = QK_K;

View File

@ -593,12 +593,187 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) {
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyDeviceToDevice, stream));
}
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream) {
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, levels, 4, cudaMemcpyDeviceToDevice, stream));
}
// Copy the IQ2_TQ per-tensor grid (16 entries x 4 int8 = 64 bytes) from host
// memory into this TU's __device__ symbol iq2tq_grid_cuda, async on `stream`.
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream) {
    void * dst = nullptr;
    CUDA_CHECK(cudaGetSymbolAddress(&dst, iq2tq_grid_cuda));
    CUDA_CHECK(cudaMemcpyAsync(dst, grid, sizeof(iq2tq_grid_cuda), cudaMemcpyHostToDevice, stream));
}
// Copy the IQ3_TQ per-tensor grid (16 entries x 8 int8 = 128 bytes) from host
// memory into this TU's __device__ symbol iq3tq_grid_cuda, async on `stream`.
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream) {
    void * dst = nullptr;
    CUDA_CHECK(cudaGetSymbolAddress(&dst, iq3tq_grid_cuda));
    CUDA_CHECK(cudaMemcpyAsync(dst, grid, sizeof(iq3tq_grid_cuda), cudaMemcpyHostToDevice, stream));
}
// Upload the IQ1_BN per-tensor codebook into this TU's __device__ symbol
// iq1bn_codebook_cuda (IQ1BN_CODEBOOK_SIZE bytes, host -> device, async).
// NOTE(review): the header declaration describes the payload as
// "codebook+scale (2064 bytes)", but this copies IQ1BN_CODEBOOK_SIZE bytes
// (4096 x 8 int8 per the symbol's comment) — confirm which size is correct.
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream) {
    int8_t * d_cb;
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_cb, iq1bn_codebook_cuda));
    CUDA_CHECK(cudaMemcpyAsync(d_cb, aux, IQ1BN_CODEBOOK_SIZE, cudaMemcpyHostToDevice, stream));
}
// Launch IQ4_NL dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of Q4_DPT per CUDA block (32 threads).
// Each thread expands 4 bytes = 8 nibbles of one 32-element block: nibble
// values are mapped through the per-tensor table q4dpt_levels_cuda (must be
// uploaded beforehand, e.g. via ggml_cuda_set_q4dpt_levels) and scaled by the
// per-block fp16 scale d. Low nibbles fill elements 0..15 of the block, high
// nibbles fill elements 16..31, matching the IQ4_NL packing.
template<typename dst_t>
static __global__ void dequantize_block_q4_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_q4_dpt * x = (const block_q4_dpt *) vx + i*(QK_K/QK4_NL);

    const int64_t tid = threadIdx.x;
    const int64_t il = tid/8; // 0...3  (4-byte group within the block)
    const int64_t ib = tid%8; // 0...7  (block within the super-block)
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * q4dpt_levels_cuda[q4[j] & 0xf]; // low nibble -> first half
        y[j+16] = d * q4dpt_levels_cuda[q4[j] >>  4]; // high nibble -> second half
    }
}
// Launch Q4_DPT dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_q4_dpt<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of Q2_DPT per CUDA block (32 threads).
// 2-bit codes (4 per byte, LSB first) index the per-tensor table
// q2dpt_levels_cuda; the result is scaled by the per-block fp16 scale d.
// Assumes QK2_DPT == 32 (consistent with the 32*ib output stride).
//
// Fix: previously each thread expanded a single byte (4 values), so the 32
// threads wrote only 128 of the QK_K = 256 outputs — elements 16..31 of every
// 32-element block (and qs bytes 4..7) were never touched, leaving half of y
// uninitialized. Each thread now expands two consecutive bytes (8 values),
// covering the whole super-block.
template<typename dst_t>
static __global__ void dequantize_block_q2_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_q2_dpt * x = (const block_q2_dpt *) vx + i*(QK_K/QK2_DPT);

    const int64_t tid = threadIdx.x;
    const int64_t il = tid/8; // 0...3  (byte pair within the block)
    const int64_t ib = tid%8; // 0...7  (block within the super-block)

    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * q2 = x[ib].qs + 2*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 2; ++j) {
        const uint8_t q = q2[j];
        y[4*j+0] = d * q2dpt_levels_cuda[(q >> 0) & 3];
        y[4*j+1] = d * q2dpt_levels_cuda[(q >> 2) & 3];
        y[4*j+2] = d * q2dpt_levels_cuda[(q >> 4) & 3];
        y[4*j+3] = d * q2dpt_levels_cuda[(q >> 6) & 3];
    }
}
// Launch Q2_DPT dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_q2_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_q2_dpt<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ2_TQ per CUDA block: one thread per
// 8-element group, 32 groups. Each group's 4-bit selector (two per scales
// byte, low nibble first) picks one of 16 rows of 4 int8 levels in the
// per-tensor grid iq2tq_grid_cuda; the 2-bit codes in qs (LSB first) index
// into that row. Output scale is the block fp16 d times IQ2TQ_GRID_SCALE.
// The grid must be uploaded beforehand (ggml_cuda_set_iq2tq_grid).
template<typename dst_t>
static __global__ void dequantize_block_iq2_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq2_tq * bq = (const block_iq2_tq *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ2TQ_GRID_SCALE;
    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF; // 4-bit row selector
    const int8_t * ge = iq2tq_grid_cuda + si * 4;              // selected row of 4 levels

    dst_t * y = yy + i * QK_K + g * 8;
    const uint8_t * qs = bq->qs + g * 2; // 2 bytes = 8 two-bit codes
    y[0] = dq * ge[(qs[0] >> 0) & 3];
    y[1] = dq * ge[(qs[0] >> 2) & 3];
    y[2] = dq * ge[(qs[0] >> 4) & 3];
    y[3] = dq * ge[(qs[0] >> 6) & 3];
    y[4] = dq * ge[(qs[1] >> 0) & 3];
    y[5] = dq * ge[(qs[1] >> 2) & 3];
    y[6] = dq * ge[(qs[1] >> 4) & 3];
    y[7] = dq * ge[(qs[1] >> 6) & 3];
}
// Launch IQ2_TQ dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up here (unlike the *_dpt launchers) — assumes k is
// a multiple of QK_K; confirm callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq2_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_tq<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ3_TQ per CUDA block: one thread per
// 8-element group, 32 groups. The group's 4-bit selector (two per scales
// byte, low nibble first) picks one of 16 rows of 8 int8 levels in the
// per-tensor grid iq3tq_grid_cuda; three qs bytes are assembled into 24 bits
// holding eight 3-bit codes (LSB first) that index into that row. Output
// scale is the block fp16 d times IQ3TQ_GRID_SCALE. The grid must be
// uploaded beforehand (ggml_cuda_set_iq3tq_grid).
template<typename dst_t>
static __global__ void dequantize_block_iq3_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq3_tq * bq = (const block_iq3_tq *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ3TQ_GRID_SCALE;
    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF; // 4-bit row selector
    const int8_t * ge = iq3tq_grid_cuda + si * 8;              // selected row of 8 levels

    dst_t * y = yy + i * QK_K + g * 8;
    const uint8_t * qs = bq->qs + g * 3; // 3 bytes = 8 three-bit codes
    const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
    y[0] = dq * ge[(bits >>  0) & 7];
    y[1] = dq * ge[(bits >>  3) & 7];
    y[2] = dq * ge[(bits >>  6) & 7];
    y[3] = dq * ge[(bits >>  9) & 7];
    y[4] = dq * ge[(bits >> 12) & 7];
    y[5] = dq * ge[(bits >> 15) & 7];
    y[6] = dq * ge[(bits >> 18) & 7];
    y[7] = dq * ge[(bits >> 21) & 7];
}
// Launch IQ3_TQ dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up — assumes k is a multiple of QK_K; confirm
// callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq3_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_tq<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ1_BN per CUDA block: one thread per
// group of IQ1BN_GROUP_SIZE elements, 32 groups. Each pair of groups packs
// two 12-bit codebook indices into a 3-byte triplet of qs (even group = low
// 12 bits, odd group = high 12 bits). The index selects a row of
// IQ1BN_CODEBOOK_DIM int8 values from the per-tensor codebook
// iq1bn_codebook_cuda (uploaded via ggml_cuda_set_iq1bn_aux), scaled by the
// block fp16 d times IQ1BN_GRID_SCALE.
template<typename dst_t>
static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq1_bn * bq = (const block_iq1_bn *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ1BN_GRID_SCALE;

    // Extract 12-bit codebook index
    const int pair = g / 2;
    int ci;
    if (g & 1) {
        ci = (bq->qs[3*pair+1] >> 4) | ((int)bq->qs[3*pair+2] << 4);
    } else {
        ci = bq->qs[3*pair] | (((int)bq->qs[3*pair+1] & 0x0F) << 8);
    }
    const int8_t * cb = iq1bn_codebook_cuda + ci * IQ1BN_CODEBOOK_DIM;

    dst_t * y = yy + i * QK_K + g * IQ1BN_GROUP_SIZE;
    // Loop over the codebook dimension instead of hard-coding 8 writes, so the
    // element count stays tied to IQ1BN_CODEBOOK_DIM (which must equal
    // IQ1BN_GROUP_SIZE for the output stride to be consistent).
#pragma unroll
    for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) {
        y[j] = dq * cb[j];
    }
}
// Launch IQ1_BN dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up — assumes k is a multiple of QK_K; confirm
// callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
const int nb = k / QK_K;
@ -748,6 +923,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq1_m_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_Q4_DPT:
return dequantize_row_q4_dpt_cuda;
case GGML_TYPE_Q2_DPT:
return dequantize_row_q2_dpt_cuda;
case GGML_TYPE_IQ2_TQ:
return dequantize_row_iq2_tq_cuda;
case GGML_TYPE_IQ3_TQ:
return dequantize_row_iq3_tq_cuda;
case GGML_TYPE_IQ1_BN:
return dequantize_row_iq1_bn_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ3_S:
@ -801,6 +986,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq1_m_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_Q4_DPT:
return dequantize_row_q4_dpt_cuda;
case GGML_TYPE_Q2_DPT:
return dequantize_row_q2_dpt_cuda;
case GGML_TYPE_IQ2_TQ:
return dequantize_row_iq2_tq_cuda;
case GGML_TYPE_IQ3_TQ:
return dequantize_row_iq3_tq_cuda;
case GGML_TYPE_IQ1_BN:
return dequantize_row_iq1_bn_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ3_S:

View File

@ -31,6 +31,22 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
// Upload the Q4_DPT lookup table (16 int8 levels); it is stored in a TU-local __device__ global (q4dpt_levels_cuda), not __constant__ memory.
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream);
// Upload the Q2_DPT lookup table (4 int8 levels); it is stored in a TU-local __device__ global (q2dpt_levels_cuda), not __constant__ memory.
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream);
// Set the IQ2_TQ per-tensor grid (64 bytes: 16 entries × 4 int8 levels).
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream);
// Set the IQ3_TQ per-tensor grid (128 bytes: 16 entries × 8 int8 levels).
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream);
// Set the IQ1_BN per-tensor codebook (IQ1BN_CODEBOOK_SIZE bytes; the implementation copies the full 4096 x 8 int8 codebook, not 2064 bytes).
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream);
template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {

View File

@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-cuda/common.cuh"
#include "ggml-quants.h"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/add-id.cuh"
#include "ggml-cuda/arange.cuh"
@ -1426,6 +1427,24 @@ static void ggml_cuda_op_mul_mat_cublas(
row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT;
// Upload per-tensor grids/levels before any dequantize path (fp16, fp32, or bf16)
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
ggml_cuda_set_q4dpt_levels((const int8_t *)src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ2_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
ggml_cuda_set_iq2tq_grid(src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ3_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
ggml_cuda_set_iq3tq_grid(src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ1_BN) {
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
}
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
if (src1->type != GGML_TYPE_BF16) {
@ -4804,6 +4823,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_BF16:
return true;
@ -4838,7 +4861,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
op->type == GGML_TYPE_Q4_DPT) &&
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
@ -4891,6 +4915,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_DPT) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
return true;
}

View File

@ -2,6 +2,8 @@
#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"
#include "convert.cuh"
#include "ggml-quants.h"
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
switch (args.type_x) {
@ -65,6 +67,12 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
case GGML_TYPE_IQ4_NL:
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
case GGML_TYPE_Q4_DPT:
mul_mat_q_case<GGML_TYPE_Q4_DPT>(ctx, args, stream);
break;
case GGML_TYPE_Q2_DPT:
mul_mat_q_case<GGML_TYPE_Q2_DPT>(ctx, args, stream);
break;
default:
GGML_ABORT("fatal error");
break;
@ -82,6 +90,22 @@ void ggml_cuda_mul_mat_q(
cudaStream_t stream = ctx.stream();
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
// Set Q4_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set Q2_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q2_DPT) {
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
@ -290,6 +314,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q2_DPT:
mmq_supported = true;
break;
default:
@ -367,3 +393,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
// Q4_DPT must be instantiated in this TU (not a separate template-instance file)
// because it accesses the TU-local __device__ variable q4dpt_levels_cuda,
// which is initialized by the code above.
DECL_MMQ_CASE(GGML_TYPE_Q4_DPT);
DECL_MMQ_CASE(GGML_TYPE_Q2_DPT);

View File

@ -1,6 +1,7 @@
#pragma once
#include "common.cuh"
#include "ggml.h"
#include "vecdotq.cuh"
#include "mma.cuh"
@ -88,6 +89,8 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
return MMQ_Q8_1_DS_LAYOUT_DS4;
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q2_DPT:
return MMQ_Q8_1_DS_LAYOUT_D4;
default:
GGML_ABORT("fatal error");
@ -205,6 +208,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q2_DPT: return MMQ_DP4A_TXS_Q8_0_16;
default: return tile_x_sizes{0, 0, 0};
}
}
@ -250,6 +255,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q2_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
default: return 0;
}
}
@ -2763,6 +2770,71 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
}
}
// MMQ tile loader for Q4_DPT. Expands the 4-bit codes to int8 through the
// per-tensor table q4dpt_levels_cuda (via get_int_from_table_16) and stores
// them in the shared-memory tile using the Q8_0 layout, so the generic
// q8_0 x q8_1 dot kernels can consume them. A second pass loads the per-block
// fp16 scales as float. The table must be uploaded before the kernel runs.
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_dpt(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    int * x_qs = (int *) x_tile;
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_DPT, mmq_y);
    int * x_qs = (int *) x_tile;
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
    constexpr int nrows = warp_size / threads_per_row;
    const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int kbx = txi / QI4_NL;  // block index along the row
    const int kqsx = txi % QI4_NL; // int (4-byte) index within the block

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
        int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            i = min(i, i_max); // clamp to the last valid row instead of reading OOB
        }

        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbx;

        const int aux_q4 = get_int_b2(bxi->qs, kqsx);
        // Expand 8 nibbles into two ints of 4 int8 values each via the table.
        const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
        const int k0 = kbx * (2 * QI4_NL) + kqsx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
#else
        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }

    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
    constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row;
    const int kbxd = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) {
        int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbxd;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
#else
        x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
@ -3447,6 +3519,22 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_NL> {
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
// MMQ dispatch for Q4_DPT: tiles are loaded pre-expanded to int8 through the
// per-tensor level table (load_tiles_q4_dpt), so the generic q8_0 x q8_1 dot
// kernels are reused for both the MMA and DP4A paths.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_DPT> {
    static constexpr int vdr = VDR_Q4_DPT_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
// MMQ dispatch for Q2_DPT.
// NOTE(review): this reuses load_tiles_q4_dpt, which parses the data as
// block_q4_dpt and expands 4-bit nibbles through q4dpt_levels_cuda — that is
// only correct if Q2_DPT truly shares that block layout, which appears to
// conflict with the 2-bit q2dpt_levels_cuda path used by the dequantize and
// mmvq code; confirm. Also, mmq_get_dp4a_tile_x_sizes returns
// MMQ_DP4A_TXS_Q8_0_16 for Q2_DPT while the reused loader writes with the
// MMQ_DP4A_TXS_Q8_0 stride (2*MMQ_TILE_NE_K + 1) — verify these agree.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_DPT> {
    static constexpr int vdr = VDR_Q2_DPT_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>; // Reuse Q4_DPT loader (same layout)
    static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ;

View File

@ -2,6 +2,8 @@
#include "quantize.cuh"
#include "unary.cuh"
#include "vecdotq.cuh"
#include "convert.cuh"
#include "ggml-quants.h"
#include <cstdint>
@ -28,6 +30,11 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1;
case GGML_TYPE_Q2_DPT: return vec_dot_q2_dpt_q8_1;
case GGML_TYPE_IQ2_TQ: return vec_dot_iq2_tq_q8_1;
case GGML_TYPE_IQ3_TQ: return vec_dot_iq3_tq_q8_1;
case GGML_TYPE_IQ1_BN: return vec_dot_iq1_bn_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
@ -54,6 +61,11 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_Q4_DPT: return VDR_Q4_DPT_Q8_1_MMVQ;
case GGML_TYPE_Q2_DPT: return VDR_Q2_DPT_Q8_1_MMVQ;
case GGML_TYPE_IQ2_TQ: return VDR_IQ2_TQ_Q8_1_MMVQ;
case GGML_TYPE_IQ3_TQ: return VDR_IQ3_TQ_Q8_1_MMVQ;
case GGML_TYPE_IQ1_BN: return VDR_IQ1_BN_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
@ -1000,6 +1012,30 @@ static void mul_mat_vec_q_switch_type(
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q4_DPT:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_DPT>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ2_TQ:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_TQ>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ3_TQ:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_TQ>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ1_BN:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_BN>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
@ -1029,6 +1065,45 @@ void ggml_cuda_mul_mat_vec_q(
cudaStream_t stream = ctx.stream();
// Set Q4_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set Q2_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q2_DPT) {
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set IQ2_TQ per-tensor grid
if (src0->type == GGML_TYPE_IQ2_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
int8_t * d_grid;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 64, cudaMemcpyHostToDevice, stream));
}
// Set IQ3_TQ per-tensor grid
if (src0->type == GGML_TYPE_IQ3_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
int8_t * d_grid;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 128, cudaMemcpyHostToDevice, stream));
}
// Set IQ1_BN per-tensor codebook+scale
if (src0->type == GGML_TYPE_IQ1_BN) {
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
}
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);

View File

@ -1240,6 +1240,194 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
return d * sumi;
}
#define VDR_Q4_DPT_Q8_1_MMVQ 2
#define VDR_Q4_DPT_Q8_1_MMQ 4
// Dot product of a slice of one Q4_DPT block against Q8_1 data.
// Q4_DPT stores 4-bit indices into a learned per-tensor table of 16 int8
// levels; the table lives in the device symbol q4dpt_levels_cuda, which is
// uploaded from tensor->quant_levels before the kernel launch.
// kbx selects the quant block, iqs the starting int32 within its qs[].
static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_q4_dpt * bq4 = (const block_q4_dpt *) vbq + kbx;
const int * q8 = (const int *) bq8_1->qs + iqs;
int sumi = 0;
#pragma unroll
for (int l = 0; l < VDR_Q4_DPT_Q8_1_MMVQ; ++l) {
// Each int32 of qs carries 8 nibbles; get_int_from_table_16 expands them
// through the level table into two packed int32s (v.x/v.y), dotted against
// q8[l + 0] and q8[l + 4] — presumably low vs high nibbles, same scheme as
// vec_dot_iq4_nl_q8_1 above (TODO confirm against get_int_from_table_16).
const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
}
// Combine the per-block fp16 scale with the Q8_1 scale (low half of ds).
const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
return d * sumi;
}
// Q2_DPT: 2-bit quantization with 4 learned levels.
// Map four 2-bit indices — one taken from the low bits of each byte of q2 —
// through the int8 level table. Each looked-up level is returned sign-extended
// in its own int4 component.
// NOTE(review): dp4a consumers treat each component as 4 packed int8 lanes,
// while this returns a single sign-extended level per component — verify this
// matches the CPU-side Q2_DPT packing.
static __device__ __forceinline__ int4 get_int_from_table_4(const int & q2, const int8_t * table) {
    const int i0 =  q2        & 3;
    const int i1 = (q2 >>  8) & 3;
    const int i2 = (q2 >> 16) & 3;
    const int i3 = (q2 >> 24) & 3;
    return make_int4(table[i0], table[i1], table[i2], table[i3]);
}
#define VDR_Q2_DPT_Q8_1_MMVQ 4
#define VDR_Q2_DPT_Q8_1_MMQ 8
// Dot product of a slice of one Q2_DPT block against Q8_1 data.
// Q2_DPT stores 2-bit indices into a learned per-tensor table of 4 int8
// levels (device symbol q2dpt_levels_cuda, uploaded from tensor->quant_levels
// before the kernel launch).
// NOTE(review): each int4 component from get_int_from_table_4 is a single
// sign-extended level, yet ggml_cuda_dp4a consumes it as 4 packed int8 lanes;
// also get_int_b4(bq2->qs, l) does not involve iqs. Both look intentional only
// if the CPU-side packing replicates levels per lane — confirm against the
// quantize_row_q2_dpt reference implementation.
static __device__ __forceinline__ float vec_dot_q2_dpt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_q2_dpt * bq2 = (const block_q2_dpt *) vbq + kbx;
const int * q8 = (const int *) bq8_1->qs + iqs;
int sumi = 0;
#pragma unroll
for (int l = 0; l < VDR_Q2_DPT_Q8_1_MMVQ; ++l) {
const int aux_q2 = get_int_b4(bq2->qs, l);
const int4 v = get_int_from_table_4(aux_q2, q2dpt_levels_cuda);
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
sumi = ggml_cuda_dp4a(v.z, q8[l + 8], sumi);
sumi = ggml_cuda_dp4a(v.w, q8[l + 12], sumi);
}
// Combine the per-block fp16 scale with the Q8_1 scale (low half of ds).
const float d = __half2float(bq2->d) * __low2float(bq8_1->ds);
return d * sumi;
}
// IQ2_TQ: 2-bit with per-tensor trained 16×4 grid table.
// Unpack four 2-bit indices from qbyte (low bits first), look each up in the
// selected 4-entry grid row, and pack the four int8 results little-endian
// into one int32 suitable for dp4a.
static __device__ __forceinline__ int iq2tq_grid_lookup4(uint8_t qbyte, const int8_t * grid_entry) {
    uint32_t packed = 0;
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        const uint32_t level = (uint8_t) grid_entry[(qbyte >> (2 * j)) & 3];
        packed |= level << (8 * j);
    }
    return (int) packed;
}
#define VDR_IQ2_TQ_Q8_1_MMVQ 1
#define VDR_IQ2_TQ_Q8_1_MMQ 1
// Dot product of one 16-element portion of an IQ2_TQ block against Q8_1 data.
// The per-tensor grid (16 rows × 4 int8 values, 64 bytes) lives in the device
// symbol iq2tq_grid_cuda, uploaded from tensor->quant_levels before launch.
// Each scales[] byte holds two 4-bit grid-row selectors, one per 8-element
// group; qs holds 2-bit in-row indices, 4 per byte.
static __device__ __forceinline__ float vec_dot_iq2_tq_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq2_tq * bq = (const block_iq2_tq *) vbq + kbx;
// iqs selects which 16-element portion (0..15): 2 groups of 8 elements
const int q8b = iqs / 2; // Q8_1 block index (0..7)
const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4)
// Grid indices for groups iqs*2 and iqs*2+1
const uint8_t sc = bq->scales[iqs];
const int8_t * ge0 = iq2tq_grid_cuda + (sc & 0xF) * 4; // row for group 0 (low nibble)
const int8_t * ge1 = iq2tq_grid_cuda + (sc >> 4) * 4; // row for group 1 (high nibble)
const uint8_t * qs = bq->qs + iqs * 4; // 4 bytes = 16 two-bit indices
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
// Group 0: 8 elements = 2 bytes qs, 2 int32 Q8_1
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[0], ge0), q8[0], sumi);
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[1], ge0), q8[1], sumi);
// Group 1: next 8 elements
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[2], ge1), q8[2], sumi);
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[3], ge1), q8[3], sumi);
// Block scale × constant grid scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ2TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
// IQ3_TQ: 3-bit with per-tensor trained 16×8 grid table
// The grid (16 rows × 8 int8 values, 128 bytes) lives in the device symbol
// iq3tq_grid_cuda, uploaded from tensor->quant_levels before launch.
// Each scales[] byte holds two 4-bit row selectors, one per 8-element group;
// each group packs 8 × 3-bit in-row indices into 3 consecutive qs bytes.
#define VDR_IQ3_TQ_Q8_1_MMVQ 1
#define VDR_IQ3_TQ_Q8_1_MMQ 1
static __device__ __forceinline__ float vec_dot_iq3_tq_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq3_tq * bq = (const block_iq3_tq *) vbq + kbx;
const int q8b = iqs / 2; // Q8_1 block index
const int q8off = (iqs & 1) * 4; // int32 offset within the Q8_1 block
const uint8_t sc = bq->scales[iqs];
const int8_t * ge0 = iq3tq_grid_cuda + (sc & 0xF) * 8; // row for group 0
const int8_t * ge1 = iq3tq_grid_cuda + (sc >> 4) * 8; // row for group 1
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
// Group 0: 8 elements, 3 bytes of qs
{
const uint8_t * qs = bq->qs + (iqs * 2) * 3;
// 24 bits = 8 indices of 3 bits each, little-endian within `bits`.
const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
// Pack the 8 looked-up int8 levels into two int32s for dp4a.
int v0 = (uint8_t)ge0[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 3) & 7] << 8)
| ((uint32_t)(uint8_t)ge0[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 9) & 7] << 24);
sumi = ggml_cuda_dp4a(v0, q8[0], sumi);
int v1 = (uint8_t)ge0[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 15) & 7] << 8)
| ((uint32_t)(uint8_t)ge0[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 21) & 7] << 24);
sumi = ggml_cuda_dp4a(v1, q8[1], sumi);
}
// Group 1: next 8 elements, next 3 bytes of qs
{
const uint8_t * qs = bq->qs + (iqs * 2 + 1) * 3;
const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
int v0 = (uint8_t)ge1[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 3) & 7] << 8)
| ((uint32_t)(uint8_t)ge1[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 9) & 7] << 24);
sumi = ggml_cuda_dp4a(v0, q8[2], sumi);
int v1 = (uint8_t)ge1[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 15) & 7] << 8)
| ((uint32_t)(uint8_t)ge1[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 21) & 7] << 24);
sumi = ggml_cuda_dp4a(v1, q8[3], sumi);
}
// Block scale × constant grid scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ3TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook
// The codebook is uploaded via ggml_cuda_set_iq1bn_aux() before launch.
// Each group of 8 weights is one 12-bit codebook index; two indices are
// packed into every 3 bytes of qs.
#define VDR_IQ1_BN_Q8_1_MMVQ 1
#define VDR_IQ1_BN_Q8_1_MMQ 1
static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq1_bn * bq = (const block_iq1_bn *) vbq + kbx;
// iqs = 0..15, each thread handles 2 groups (16 elements)
const int q8b = iqs / 2; // Q8_1 block index
const int q8off = (iqs & 1) * 4; // int32 offset within the Q8_1 block
// Extract two 12-bit codebook indices from qs[3*iqs .. 3*iqs+2]
const uint8_t * qs = bq->qs + 3 * iqs;
const int ci0 = qs[0] | (((int)qs[1] & 0x0F) << 8); // byte0 + low nibble of byte1
const int ci1 = (qs[1] >> 4) | ((int)qs[2] << 4); // high nibble of byte1 + byte2
// Each codebook entry is IQ1BN_CODEBOOK_DIM int8 values — presumably 8,
// read as two int32s below; the int* cast assumes 4-byte-aligned entries
// (TODO confirm IQ1BN_CODEBOOK_DIM and codebook alignment).
const int * cb0 = (const int *)(iq1bn_codebook_cuda + ci0 * IQ1BN_CODEBOOK_DIM);
const int * cb1 = (const int *)(iq1bn_codebook_cuda + ci1 * IQ1BN_CODEBOOK_DIM);
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
sumi = ggml_cuda_dp4a(cb0[0], q8[0], sumi);
sumi = ggml_cuda_dp4a(cb0[1], q8[1], sumi);
sumi = ggml_cuda_dp4a(cb1[0], q8[2], sumi);
sumi = ggml_cuda_dp4a(cb1[1], q8[3], sumi);
// Block scale × constant codebook scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ1BN_GRID_SCALE * __low2float(bq8_1[q8b].ds) * (float)sumi;
}
#define VDR_IQ4_XS_Q8_1_MMVQ 4
#define VDR_IQ4_XS_Q8_1_MMQ 4

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,7 @@ GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
@ -42,36 +43,37 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
// Dequantization
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@ -82,6 +84,14 @@ GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RE
GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q3_KPT level management
GGML_API void q3kpt_set_levels(const float * levels);
GGML_API const float * q3kpt_get_levels(void);
GGML_API void q3kpt_free_levels(void);
GGML_API void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[Q3KPT_N_LEVELS]);
GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@ -102,6 +112,198 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1])
GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization)
GGML_API const float * q3pt_get_levels(void);
GGML_API void q3pt_free_levels(void);
// Per-tensor levels registry (inference — range-based lookup by data address)
// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized
// 16-element sub-block values. Also sets the global levels via q3pt_set_levels().
// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL.
GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[8]);
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels
GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels)
GGML_API void q4dpt_set_levels(const int8_t * levels);
GGML_API const int8_t * q4dpt_get_levels(void);
GGML_API void q4dpt_free_levels(void);
// Q2_DPT: 2-bit with learned per-tensor int8 levels
GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels)
GGML_API void q2dpt_set_levels(const int8_t * levels);
GGML_API const int8_t * q2dpt_get_levels(void);
GGML_API void q2dpt_free_levels(void);
GGML_API void q2dpt_set_quant_strategy(int s);
// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT.
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4].
// Also sets the global levels via q2dpt_set_levels().
GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]);
// Q2_KPT: Q2_K with learned per-tensor float levels
GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q2_KPT levels management (per-tensor float levels in [0,1])
GGML_API void q2kpt_set_levels(const float * levels);
GGML_API const float * q2kpt_get_levels(void);
GGML_API void q2kpt_free_levels(void);
// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization)
GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT.
// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids.
// Also sets the global levels via q2kpt_set_levels().
GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[Q2KPT_N_LEVELS]);
// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels.
GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float * out_levels);
// IQ2_TQ: 2-bit scalar with per-group asymmetric grid (2.5625 bpw)
GGML_API void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
GGML_API void iq2tq_set_grid(const int8_t grid[64]);
GGML_API const int8_t * iq2tq_get_grid(void);
// IQ3_TQ: 3-bit scalar with per-group asymmetric grid (3.5625 bpw)
GGML_API void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]);
GGML_API void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]);
GGML_API const int8_t * iq3tq_get_grid(void);
// IQ1_BN: 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
GGML_API void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread);
GGML_API void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]);
GGML_API const int8_t * iq1bn_get_aux(void);
// Train 16 Lloyd-Max int8 levels from tensor data.
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16].
// Also sets the global levels via q4dpt_set_levels().
GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]);
// NOTE: this span previously repeated, byte for byte, the q3_pt / q4_dpt /
// q2_dpt / q2_kpt / iq2_tq / iq3_tq / iq1_bn prototypes and their comments
// that already appear immediately above. The redundant second copy has been
// removed; all declarations remain available from the first copy.
GGML_API void iq2xs_init_impl(enum ggml_type type);
GGML_API void iq2xs_free_impl(enum ggml_type type);
GGML_API void iq3xs_init_impl(int grid_size);

View File

@ -12273,7 +12273,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
}
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant, const void * levels = nullptr) {
if (quant == GGML_TYPE_F32) {
memcpy(to, from, sizeof(float) * ne);
return;
@ -12283,7 +12283,7 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
ggml_to_float_t dequant_fn = tt->to_float;
dequant_fn(from, to, ne);
dequant_fn(from, to, ne, levels);
}
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {

View File

@ -456,6 +456,11 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
}
}
static void ggml_fp16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
GGML_UNUSED(levels);
ggml_fp16_to_fp32_row((const ggml_fp16_t *)x, y, n);
}
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
int i = 0;
for (; i < n; ++i) {
@ -470,6 +475,11 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
}
}
static void ggml_bf16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
GGML_UNUSED(levels);
ggml_bf16_to_fp32_row((const ggml_bf16_t *)x, y, n);
}
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
for (int i = 0; i < n; i++) {
y[i] = ggml_compute_fp32_to_bf16(x[i]);
@ -648,7 +658,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.to_float = ggml_fp16_to_fp32_row_leveled,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
},
[GGML_TYPE_Q1_0] = {
@ -857,7 +867,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.blck_size = 1,
.type_size = sizeof(ggml_bf16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.to_float = ggml_bf16_to_fp32_row_leveled,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
},
[31] = { // GGML_TYPE_Q4_0_4_4
@ -912,6 +922,71 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.type_size = 0,
.is_quantized = false,
},
[GGML_TYPE_Q3_PT] = {
.type_name = "q3_pt",
.blck_size = QK_K,
.type_size = sizeof(block_q3_pt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_pt,
.from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref,
},
[GGML_TYPE_Q3_KPT] = {
.type_name = "q3_kpt",
.blck_size = QK_K,
.type_size = sizeof(block_q3_kpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_kpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref,
},
[GGML_TYPE_Q4_DPT] = {
.type_name = "q4_dpt",
.blck_size = QK4_NL,
.type_size = sizeof(block_q4_dpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_dpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref,
},
[GGML_TYPE_Q2_DPT] = {
.type_name = "q2_dpt",
.blck_size = QK2_DPT,
.type_size = sizeof(block_q2_dpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_dpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q2_dpt_ref,
},
// Q2_K layout with learned per-tensor float levels (2.625 bpw); levels are per-row.
[GGML_TYPE_Q2_KPT] = {
.type_name = "q2_kpt",
.blck_size = QK_K,
.type_size = sizeof(block_q2_kpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_kpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref,
// NOTE(review): levels_row_stride is declared on struct ggml_type_traits_cpu
// (ggml-cpu.h), not on struct ggml_type_traits — confirm this field exists on
// this struct, otherwise this designated initializer will not compile.
.levels_row_stride = 0, // computed dynamically: (ne[0]/256)*4*sizeof(float)
},
[GGML_TYPE_IQ2_TQ] = {
.type_name = "iq2_tq",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_tq),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_tq,
.from_float_ref = (ggml_from_float_t) quantize_row_iq2_tq_ref,
},
[GGML_TYPE_IQ3_TQ] = {
.type_name = "iq3_tq",
.blck_size = QK_K,
.type_size = sizeof(block_iq3_tq),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_tq,
.from_float_ref = (ggml_from_float_t) quantize_row_iq3_tq_ref,
},
[GGML_TYPE_IQ1_BN] = {
.type_name = "iq1_bn",
.blck_size = QK_K,
.type_size = sizeof(block_iq1_bn),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_bn,
.from_float_ref = (ggml_from_float_t) quantize_row_iq1_bn_ref,
},
};
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@ -1412,6 +1487,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break;
case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break;
case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break;
case GGML_FTYPE_MOSTLY_Q2_KPT: wtype = GGML_TYPE_Q2_KPT; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -7607,6 +7686,13 @@ void ggml_quantize_init(enum ggml_type type) {
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
case GGML_TYPE_IQ2_TQ: break; // per-tensor grid stored in tensor->quant_levels
case GGML_TYPE_IQ3_TQ: break; // per-tensor grid stored in tensor->quant_levels
case GGML_TYPE_IQ1_BN: break; // per-tensor codebook stored in tensor->quant_levels
case GGML_TYPE_Q3_PT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q3_KPT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q4_DPT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q2_KPT: break; // levels stored in tensor->quant_levels
default: // nothing
break;
}
@ -7685,6 +7771,13 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, start_row, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_TQ: result = quantize_iq2_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_TQ: result = quantize_iq3_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);

View File

@ -1331,37 +1331,63 @@ struct gguf_writer_base {
if (kv.is_array) {
write(GGUF_TYPE_ARRAY);
write(kv.get_type());
const enum gguf_type elem_type = kv.get_type();
write(elem_type);
write(ne);
// Write array element data based on element type
switch (elem_type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
// Write raw bytes inline for array data
for (size_t i = 0; i < kv.data.size(); ++i) {
write(kv.data[i]);
}
} break;
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
}
} break;
case GGUF_TYPE_STRING: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<std::string>(i));
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid array element type");
}
} else {
write(kv.get_type());
}
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
}
} break;
case GGUF_TYPE_STRING: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<std::string>(i));
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
case GGUF_TYPE_BOOL: {
write(kv.get_val<bool>(0));
} break;
case GGUF_TYPE_STRING: {
write(kv.get_val<std::string>(0));
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -155,6 +155,14 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_PT = 41, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_KPT = 42, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_DPT = 43, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_KPT = 44, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_TQ = 45, // except 1d tensors, trellis quantized with RNG codebook
LLAMA_FTYPE_MOSTLY_IQ3_TQ = 46, // except 1d tensors, 3-bit with per-tensor trained grid
LLAMA_FTYPE_MOSTLY_IQ1_BN = 47, // except 1d tensors, 8D vector quantized with trained codebook
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

View File

@ -157,8 +157,8 @@ int main(int argc, char** argv) {
t1 = std::chrono::high_resolution_clock::now();
float fs;
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1, nullptr);
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1, nullptr);
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);

View File

@ -285,8 +285,8 @@ int main(int argc, char** argv) {
else {
const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
vdot->from_float(y1.data(), q8.data(), kVecSize);
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1, nullptr);
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1, nullptr);
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();

604
scripts/analyze-ffn-down.py Normal file
View File

@ -0,0 +1,604 @@
#!/usr/bin/env python3
"""Deep analysis of WHY ffn_down is hard to quantize.
Compares structural properties of all weight and activation tensors.
"""
import numpy as np
import struct
import sys
import os
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
def load_f32_tensor(name):
    """Load a binary tensor from DATA_DIR/name.

    File format: two little-endian int64 values (nrow, ncol) followed by
    nrow*ncol float32 values.

    Returns:
        np.ndarray of shape (nrow, ncol), dtype float32.
    """
    path = os.path.join(DATA_DIR, name)
    with open(path, "rb") as f:
        header = f.read(16)
        # Guard against a truncated header before unpacking ('qq' needs 16 bytes).
        assert len(header) == 16, f"{name}: truncated header ({len(header)} bytes)"
        nrow, ncol = struct.unpack("qq", header)
        data = np.frombuffer(f.read(), dtype=np.float32)
    # Include the filename so a size mismatch is traceable to a specific dump.
    assert len(data) == nrow * ncol, f"{name}: expected {nrow * ncol}, got {len(data)}"
    return data.reshape(nrow, ncol)
def stats(label, arr):
    """Print comprehensive statistics for an array and return a summary dict.

    Args:
        label: section header printed before the statistics.
        arr: numpy array of any shape; flattened before analysis.

    Returns:
        dict with keys "mean", "std", "skew", "kurt", "min", "max".
    """
    a = arr.ravel()
    # Hoist the moments: the original recomputed a.mean()/a.std() on every use.
    mean = a.mean()
    std = a.std()
    # Standardized values, guarded against constant arrays (std == 0).
    z = (a - mean) / (std + 1e-10)
    print(f" {label}:")
    print(f" shape={arr.shape}, n={len(a)}")
    print(f" mean={mean:.6f}, std={std:.6f}")
    print(f" min={a.min():.6f}, max={a.max():.6f}")
    print(f" median={np.median(a):.6f}")
    print(
        f" |mean|/std = {abs(mean) / (std + 1e-10):.4f} (offset-to-spread ratio)"
    )
    # Excess kurtosis: heavy-tailedness relative to a Gaussian (which scores 0).
    kurt = np.mean(z ** 4) - 3.0
    # Skewness: asymmetry of the distribution around its mean.
    skew = np.mean(z ** 3)
    print(f" skewness={skew:.4f}, excess_kurtosis={kurt:.4f}")
    # Percentile ranges
    pcts = np.percentile(a, [0.1, 1, 5, 25, 50, 75, 95, 99, 99.9])
    print(
        f" percentiles: 0.1%={pcts[0]:.4f}, 1%={pcts[1]:.4f}, 5%={pcts[2]:.4f}, "
        f"25%={pcts[3]:.4f}, 50%={pcts[4]:.4f}, 75%={pcts[5]:.4f}, "
        f"95%={pcts[6]:.4f}, 99%={pcts[7]:.4f}, 99.9%={pcts[8]:.4f}"
    )
    # Sparsity: fraction of entries that are near zero relative to the spread.
    near_zero = np.sum(np.abs(a) < 0.001 * std) / len(a)
    print(f" fraction |x| < 0.001*std: {near_zero:.4f}")
    return {
        "mean": mean,
        "std": std,
        "skew": skew,
        "kurt": kurt,
        "min": a.min(),
        "max": a.max(),
    }
# ============================================================================
# 1. BASIC WEIGHT TENSOR COMPARISON
# ============================================================================
print("=" * 80)
print("SECTION 1: WEIGHT TENSOR GLOBAL STATISTICS")
print("=" * 80)
tensors = {
"ffn_gate": ("blk_0_ffn_gate_weight.f32bin", "9728x2560 (wide→narrow proj)"),
"ffn_up": ("blk_0_ffn_up_weight.f32bin", "9728x2560 (wide→narrow proj)"),
"ffn_down": ("blk_0_ffn_down_weight.f32bin", "2560x9728 (narrow→wide proj)"),
"attn_q": ("blk_0_attn_q_weight.f32bin", "4096x2560"),
"attn_k": ("blk_0_attn_k_weight.f32bin", "1024x2560"),
"attn_v": ("blk_0_attn_v_weight.f32bin", "1024x2560"),
"attn_out": ("blk_0_attn_output_weight.f32bin", "2560x4096"),
}
weight_data = {}
for name, (fname, desc) in tensors.items():
try:
W = load_f32_tensor(fname)
print(f"\n{'' * 70}")
print(f" {name} [{desc}] — file: {fname}")
weight_data[name] = W
stats(name, W)
except Exception as e:
print(f" {name}: SKIP ({e})")
# ============================================================================
# 2. ROW-LEVEL STATISTICS (each row is a neuron output)
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 2: ROW-LEVEL VARIABILITY (per-neuron weight statistics)")
print("=" * 80)
print(" Each row of the weight matrix produces one output dimension.")
print(" High row-to-row variability in mean/std means the quantizer")
print(" must handle very different distributions across rows.\n")
for name, W in weight_data.items():
row_means = W.mean(axis=1)
row_stds = W.std(axis=1)
row_ranges = W.max(axis=1) - W.min(axis=1)
print(f"\n {name} ({W.shape[0]} rows × {W.shape[1]} cols):")
print(
f" Row means: mean={row_means.mean():.6f}, std={row_means.std():.6f}, "
f"range=[{row_means.min():.6f}, {row_means.max():.6f}]"
)
print(
f" Row stds: mean={row_stds.mean():.6f}, std={row_stds.std():.6f}, "
f"range=[{row_stds.min():.6f}, {row_stds.max():.6f}]"
)
print(f" Row ranges: mean={row_ranges.mean():.6f}, std={row_ranges.std():.6f}")
print(
f" RowMeans CV (std/mean): {row_means.std() / (abs(row_means.mean()) + 1e-10):.4f}"
)
print(f" RowStds CV: {row_stds.std() / (row_stds.mean() + 1e-10):.4f}")
# ============================================================================
# 3. GROUP-LEVEL ANALYSIS (16-element groups, like Q2_K)
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 3: GROUP-LEVEL ANALYSIS (16-element groups)")
print("=" * 80)
print(" Quantization works on 16-element groups. Key question:")
print(" How much does each group need its own OFFSET (dmin)?\n")
GS = 16
for name, W in weight_data.items():
# Look at first 256 rows for speed
nr = min(W.shape[0], 256)
nc = W.shape[1]
group_means = []
group_stds = []
group_ranges = []
group_offsets = [] # |mean| / range — how important is the offset
for r in range(nr):
for g_start in range(0, nc, GS):
g = W[r, g_start : g_start + GS]
gm = g.mean()
gs = g.std()
gr = g.max() - g.min()
gmin = g.min()
group_means.append(gm)
group_stds.append(gs)
group_ranges.append(gr)
# Offset importance: how large is the group mean relative to its range?
# If this is high, offset (dmin) matters a lot
if gr > 1e-10:
group_offsets.append(abs(gm) / gr)
else:
group_offsets.append(0)
gm = np.array(group_means)
gs = np.array(group_stds)
gr = np.array(group_ranges)
go = np.array(group_offsets)
print(f"\n {name} ({len(group_means)} groups):")
print(
f" Group mean: mean={gm.mean():.6f}, std={gm.std():.6f}, "
f"range=[{gm.min():.6f}, {gm.max():.6f}]"
)
print(f" Group std: mean={gs.mean():.6f}, std={gs.std():.6f}")
print(f" Group range: mean={gr.mean():.6f}, std={gr.std():.6f}")
print(f" *** OFFSET IMPORTANCE (|group_mean| / range) ***")
print(
f" mean={go.mean():.4f}, median={np.median(go):.4f}, "
f"p90={np.percentile(go, 90):.4f}, max={go.max():.4f}"
)
print(f" fraction with offset > 0.1: {np.mean(go > 0.1):.3f}")
print(f" fraction with offset > 0.2: {np.mean(go > 0.2):.3f}")
print(f" fraction with offset > 0.3: {np.mean(go > 0.3):.3f}")
# How well does zeroing the min (Q2_K style, clamping min to 0) work?
# vs keeping the actual min
mse_no_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale
mse_with_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale + offset
for r in range(nr):
for g_start in range(0, nc, GS):
g = W[r, g_start : g_start + GS]
gmin = g.min()
gmax = g.max()
gr = gmax - gmin
if gr < 1e-10:
continue
# No offset: clamp min to 0, scale = max/3
if gmin > 0:
scale_no = gmax / 3.0
min_no = 0
else:
scale_no = gmax / 3.0
min_no = 0 # lose the negative offset
# Actually use (gmax - 0)/3 but we're clamping gmin to 0
# Better: use actual min/max
scale_w = gr / 3.0
min_w = gmin
for val in g:
# No offset quantization
norm_no = val / (scale_no + 1e-10)
idx_no = max(0, min(3, int(round(norm_no))))
recon_no = scale_no * idx_no
mse_no_offset += (val - recon_no) ** 2
# With offset quantization
norm_w = (val - min_w) / (scale_w + 1e-10)
idx_w = max(0, min(3, int(round(norm_w))))
recon_w = min_w + scale_w * idx_w
mse_with_offset += (val - recon_w) ** 2
total_elements = nr * nc
rmse_no = np.sqrt(mse_no_offset / total_elements)
rmse_w = np.sqrt(mse_with_offset / total_elements)
improvement = (rmse_no - rmse_w) / rmse_no * 100
print(f" Quant RMSE (no offset): {rmse_no:.6f}")
print(f" Quant RMSE (with offset): {rmse_w:.6f}")
print(f" Offset benefit: {improvement:.1f}% RMSE reduction")
# ============================================================================
# 4. ACTIVATION ANALYSIS
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 4: ACTIVATION DISTRIBUTION COMPARISON")
print("=" * 80)
activations = {
"ffn_input (gate/up)": "act_blk0_ffn_input.f32bin",
"ffn_down_input (swiglu)": "act_blk0_ffn_down_input.f32bin",
"attn_input (q/k/v)": "act_blk0_attn_input.f32bin",
"attn_output_input": "act_blk0_attn_output_input.f32bin",
}
act_data = {}
for name, fname in activations.items():
    try:
        A = load_f32_tensor(fname)
        act_data[name] = A
        # Fix: the separator was `'' * 70` (empty string repeated — prints nothing);
        # the rule character was evidently lost in encoding. Use a visible dash rule.
        print(f"\n{'-' * 70}")
        print(f" {name} — {fname}")
        stats(name, A)
    except Exception as e:
        # Missing/short dump files are expected in partial captures — report and continue.
        print(f" {name}: SKIP ({e})")
# ============================================================================
# 5. THE CRITICAL QUESTION: PER-DIMENSION ACTIVATION MAGNITUDE
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 5: PER-DIMENSION ACTIVATION POWER (per-column RMS)")
print("=" * 80)
print(" If activation dimensions have very different magnitudes,")
print(" the quantization error in each weight dimension is weighted differently.")
print(" Dimensions with high activation power amplify weight errors.\n")
for name, A in act_data.items():
col_rms = np.sqrt(np.mean(A**2, axis=0)) # RMS per column (dimension)
print(f"\n {name} ({A.shape[1]} dimensions):")
print(f" Col RMS: mean={col_rms.mean():.6f}, std={col_rms.std():.6f}")
print(f" Col RMS range: [{col_rms.min():.6f}, {col_rms.max():.6f}]")
print(f" Col RMS CV (std/mean): {col_rms.std() / (col_rms.mean() + 1e-10):.4f}")
print(f" Max/Min ratio: {col_rms.max() / (col_rms.min() + 1e-10):.1f}x")
# Top 10 and bottom 10 dimensions by power
top10 = np.argsort(col_rms)[-10:][::-1]
bot10 = np.argsort(col_rms)[:10]
print(
f" Top-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in top10[:5]]}..."
)
print(
f" Bot-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in bot10[:5]]}..."
)
# How much do the top 10% of dimensions contribute to total power?
total_power = np.sum(col_rms**2)
sorted_power = np.sort(col_rms**2)[::-1]
top10pct = int(len(col_rms) * 0.1)
top10pct_power = np.sum(sorted_power[:top10pct])
top1pct = max(1, int(len(col_rms) * 0.01))
top1pct_power = np.sum(sorted_power[:top1pct])
print(
f" Top 10% of dims contribute {top10pct_power / total_power * 100:.1f}% of total power"
)
print(
f" Top 1% of dims contribute {top1pct_power / total_power * 100:.1f}% of total power"
)
# ============================================================================
# 6. CROSS-CORRELATION: WEIGHT ERROR × ACTIVATION POWER
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 6: WHERE DO WEIGHT ERRORS MEET HIGH ACTIVATION POWER?")
print("=" * 80)
print(" For each weight dimension, compute: activation_rms[dim] × weight_error[dim]")
print(" This tells us which dimensions contribute most to matmul error.\n")
# Focus on ffn_down vs ffn_gate for comparison
focus = [
("ffn_down", "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin"),
("ffn_gate", "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin"),
("ffn_up", "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin"),
("attn_q", "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin"),
]
for name, wfile, afile in focus:
W = load_f32_tensor(wfile)
A = load_f32_tensor(afile)
if W.shape[1] != A.shape[1]:
print(f" {name}: dim mismatch W={W.shape[1]} vs A={A.shape[1]}, SKIP")
continue
nc = W.shape[1]
# Per-column activation RMS
act_rms = np.sqrt(np.mean(A**2, axis=0))
# Per-column weight std and range (how "hard" to quantize)
w_std = W.std(axis=0)
w_range = W.max(axis=0) - W.min(axis=0)
# Per-column weight kurtosis (heavy tails = harder to quantize)
w_kurt = (
np.mean(((W - W.mean(axis=0)) / (W.std(axis=0) + 1e-10)) ** 4, axis=0) - 3.0
)
# Weight error proxy: with 2-bit uniform quant on 16-element groups
# Higher variance columns → more error
nr = min(W.shape[0], 256)
# Simple Q2_K-style error estimate per dimension:
# For each group of 16 in the column direction, quantize and measure error
dim_mse = np.zeros(nc)
for g_start in range(0, nc, GS):
g_end = min(g_start + GS, nc)
for r in range(nr):
g = W[r, g_start:g_end]
gmin = min(g.min(), 0) # Q2_K clamps min to ≤0
gmax = g.max()
gr = gmax - gmin
if gr < 1e-10:
continue
scale = gr / 3.0
for i, val in enumerate(g):
norm = (val - gmin) / scale
idx = max(0, min(3, int(round(norm))))
recon = gmin + scale * idx
dim_mse[g_start + i] += (val - recon) ** 2
dim_rmse = np.sqrt(dim_mse / nr)
# The key metric: dimension-level contribution to matmul error
# matmul_error_contribution[d] ≈ act_rms[d] * weight_rmse[d]
matmul_contrib = act_rms * dim_rmse
print(f"\n {name} ({nc} dimensions):")
print(
f" act_rms: mean={act_rms.mean():.4f}, CV={act_rms.std() / act_rms.mean():.4f}"
)
print(
f" w_rmse: mean={dim_rmse.mean():.6f}, CV={dim_rmse.std() / (dim_rmse.mean() + 1e-10):.4f}"
)
print(
f" matmul_contrib: mean={matmul_contrib.mean():.6f}, "
f"std={matmul_contrib.std():.6f}"
)
# Correlation between activation power and weight error
corr = np.corrcoef(act_rms, dim_rmse)[0, 1]
print(f" CORRELATION act_rms ↔ weight_rmse: {corr:.4f}")
print(f" (>0 means high-power dims are also hard to quantize — BAD)")
# Top contributors to matmul error
top_dims = np.argsort(matmul_contrib)[-20:][::-1]
print(f" Top-5 error-contributing dimensions:")
for d in top_dims[:5]:
print(
f" dim {d}: act_rms={act_rms[d]:.4f}, w_rmse={dim_rmse[d]:.6f}, "
f"contrib={matmul_contrib[d]:.6f}, w_std={w_std[d]:.6f}, w_kurt={w_kurt[d]:.2f}"
)
# Distribution of matmul contributions
total_contrib = matmul_contrib.sum()
sorted_contrib = np.sort(matmul_contrib)[::-1]
for pct in [0.01, 0.05, 0.10, 0.25]:
n = max(1, int(nc * pct))
print(
f" Top {pct * 100:.0f}% dims: {sorted_contrib[:n].sum() / total_contrib * 100:.1f}% "
f"of total matmul error"
)
# ============================================================================
# 7. THE STRUCTURAL ASYMMETRY: COLUMN DIRECTION GROUP ANALYSIS
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 7: STRUCTURAL ASYMMETRY — COLUMN vs ROW GROUPING")
print("=" * 80)
print(" Quantization groups along the ROW (inner dim). For ffn_down,")
print(" each row has 9728 elements (38 groups of 256).")
print(" For ffn_gate, each row has 2560 elements (10 groups of 256).")
print(" More groups = more metadata (scales/offsets) relative to data bits.\n")
for name, wfile, afile in focus:
W = load_f32_tensor(wfile)
nc = W.shape[1]
n_groups_per_row = nc // 256 # super-blocks per row
print(f"\n {name}: {nc} cols → {n_groups_per_row} super-blocks per row")
print(f" Groups per row: {nc // GS} (16-element groups)")
print(
f" With Q2_K (2.625 bpw): {n_groups_per_row * 2} scale+offset bytes per row"
)
# How much do group means vary WITHIN a row?
nr = min(W.shape[0], 64)
intra_row_mean_var = []
for r in range(nr):
group_means = []
for g_start in range(0, nc, GS):
group_means.append(W[r, g_start : g_start + GS].mean())
group_means = np.array(group_means)
intra_row_mean_var.append(group_means.std())
print(
f" Intra-row group mean variability (avg across rows): "
f"mean={np.mean(intra_row_mean_var):.6f}"
)
# How much does the sign of group means vary?
pos_frac = 0
neg_frac = 0
total_groups = 0
for r in range(nr):
for g_start in range(0, nc, GS):
gm = W[r, g_start : g_start + GS].mean()
if gm > 0.001:
pos_frac += 1
elif gm < -0.001:
neg_frac += 1
total_groups += 1
print(
f" Group mean sign: {pos_frac / total_groups * 100:.1f}% positive, "
f"{neg_frac / total_groups * 100:.1f}% negative, "
f"{(1 - pos_frac / total_groups - neg_frac / total_groups) * 100:.1f}% near-zero"
)
# ============================================================================
# 8. THE SWIGLU EFFECT: WHY ffn_down INPUT IS SPECIAL
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 8: THE SWIGLU EFFECT — ffn_down ACTIVATION STRUCTURE")
print("=" * 80)
print(" ffn_down's activation is the SwiGLU output: silu(gate) * up")
print(" This creates a specific activation pattern that differs from")
print(" raw FFN input (RMSNorm output).\n")
if "ffn_input (gate/up)" in act_data and "ffn_down_input (swiglu)" in act_data:
A_in = act_data["ffn_input (gate/up)"]
A_swiglu = act_data["ffn_down_input (swiglu)"]
print(f" FFN input (RMSNorm output): {A_in.shape}")
print(f" SwiGLU output: {A_swiglu.shape}")
# Per-token analysis
for t in range(min(A_swiglu.shape[0], 3)):
tok_in = A_in[t]
tok_sw = A_swiglu[t]
print(f"\n Token {t}:")
print(
f" FFN input: mean={tok_in.mean():.6f}, std={tok_in.std():.6f}, "
f"|max|={np.abs(tok_in).max():.6f}"
)
print(
f" SwiGLU out: mean={tok_sw.mean():.6f}, std={tok_sw.std():.6f}, "
f"|max|={np.abs(tok_sw).max():.6f}"
)
# SwiGLU creates lots of near-zero values (silu suppresses negatives)
frac_nearzero_sw = np.mean(np.abs(tok_sw) < 0.01 * tok_sw.std())
frac_nearzero_in = np.mean(np.abs(tok_in) < 0.01 * tok_in.std())
print(
f" Near-zero fraction: FFN input={frac_nearzero_in:.3f}, "
f"SwiGLU={frac_nearzero_sw:.3f}"
)
# Sparsity pattern
frac_neg = np.mean(tok_sw < 0)
print(f" SwiGLU negative fraction: {frac_neg:.3f}")
# Dimension-level analysis of SwiGLU
print(f"\n Dimension-level SwiGLU properties:")
dim_mean_sw = A_swiglu.mean(axis=0)
dim_std_sw = A_swiglu.std(axis=0)
dim_sparsity = np.mean(A_swiglu < 0, axis=0) # fraction of tokens negative per dim
print(f" Dim mean range: [{dim_mean_sw.min():.6f}, {dim_mean_sw.max():.6f}]")
print(f" Dim std range: [{dim_std_sw.min():.6f}, {dim_std_sw.max():.6f}]")
print(
f" Dim negative fraction: mean={dim_sparsity.mean():.3f}, "
f"range=[{dim_sparsity.min():.3f}, {dim_sparsity.max():.3f}]"
)
# Highly sparse dimensions (mostly near-zero after SwiGLU)
high_sparsity = np.sum(dim_sparsity > 0.7)
low_sparsity = np.sum(dim_sparsity < 0.3)
print(f" Dims with >70% negative tokens: {high_sparsity}/{len(dim_sparsity)}")
print(f" Dims with <30% negative tokens: {low_sparsity}/{len(dim_sparsity)}")
# ============================================================================
# 9. QUANTIZATION NOISE × ACTIVATION POWER: THE MATMUL ERROR DECOMPOSITION
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 9: MATMUL ERROR DECOMPOSITION")
print("=" * 80)
print(
" matmul_error ≈ sum over groups of (activation_power_in_group × "
"weight_mse_in_group)"
)
print(
" If activation power is concentrated in groups with high weight error, "
"matmul error explodes.\n"
)
# For ffn_down specifically, compare where activation power sits vs weight error
W_down = load_f32_tensor("blk_0_ffn_down_weight.f32bin")
A_swiglu = load_f32_tensor("act_blk0_ffn_down_input.f32bin")
W_gate = load_f32_tensor("blk_0_ffn_gate_weight.f32bin")
A_ffn_in = load_f32_tensor("act_blk0_ffn_input.f32bin")
for label, W, A in [("ffn_down", W_down, A_swiglu), ("ffn_gate", W_gate, A_ffn_in)]:
nc = W.shape[1]
nr = min(W.shape[0], 128)
# Compute per-superblock (256) activation power and weight error
n_sb = nc // 256
sb_act_power = np.zeros(n_sb)
sb_weight_mse = np.zeros(n_sb)
for sb in range(n_sb):
s = sb * 256
e = s + 256
# Activation power: mean squared activation in this region
sb_act_power[sb] = np.mean(A[:, s:e] ** 2)
# Weight MSE: Q2_K-style uniform quant error
mse = 0
count = 0
for r in range(nr):
for g in range(0, 256, GS):
gvals = W[r, s + g : s + g + GS]
gmin = min(gvals.min(), 0)
gmax = gvals.max()
gr = gmax - gmin
if gr < 1e-10:
continue
scale = gr / 3.0
for v in gvals:
norm = (v - gmin) / scale
idx = max(0, min(3, int(round(norm))))
recon = gmin + scale * idx
mse += (v - recon) ** 2
count += 1
sb_weight_mse[sb] = mse / max(count, 1)
# Correlation between activation power and weight error across super-blocks
valid = sb_act_power > 1e-10
if valid.sum() > 10:
corr = np.corrcoef(np.sqrt(sb_act_power[valid]), np.sqrt(sb_weight_mse[valid]))[
0, 1
]
else:
corr = 0
print(f"\n {label}:")
print(f" Super-blocks: {n_sb}")
print(
f" act_power: mean={sb_act_power.mean():.6f}, "
f"std={np.sqrt(sb_act_power.var()):.6f}, "
f"range=[{sb_act_power.min():.6f}, {sb_act_power.max():.6f}]"
)
print(
f" weight_mse: mean={sb_weight_mse.mean():.6f}, "
f"range=[{sb_weight_mse.min():.6f}, {sb_weight_mse.max():.6f}]"
)
print(f" CORRELATION (act_power ↔ weight_mse): {corr:.4f}")
# Show top-5 super-blocks by contribution to matmul error
contrib = sb_act_power * sb_weight_mse
top5 = np.argsort(contrib)[-5:][::-1]
print(f" Top-5 error-contributing super-blocks (of {n_sb}):")
for idx in top5:
print(
f" SB {idx * 256}-{(idx + 1) * 256 - 1}: act_power={sb_act_power[idx]:.6f}, "
f"weight_mse={sb_weight_mse[idx]:.6f}, contrib={contrib[idx]:.6f}"
)
print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)

105
scripts/compute-imatrix.py Normal file
View File

@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""Compute imatrix (importance matrix) from captured activation tensors.
The imatrix is the per-dimension sum-of-squares of the activations.
It's what upstream llama.cpp uses to weight quantization optimization.
For each activation file act_blkL_*.f32bin, produces imatrix_blkL_<role>.f32bin
where <role> matches the weight tensor it multiplies with.
Format: flat float32 array of length n_per_row, one importance value per dimension.
"""
import numpy as np
import struct
import os
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
def load_f32_tensor(name):
    """Load a .f32bin tensor from DATA_DIR.

    File layout: two little-endian int64 values (nrow, ncol) followed by
    nrow*ncol float32 values. Returns a (nrow, ncol) float32 array.
    """
    full_path = os.path.join(DATA_DIR, name)
    with open(full_path, "rb") as fh:
        n_rows, n_cols = struct.unpack("qq", fh.read(16))
        payload = np.frombuffer(fh.read(), dtype=np.float32)
    assert len(payload) == n_rows * n_cols
    return payload.reshape(n_rows, n_cols)
def save_imatrix(name, data):
    """Write `data` as a flat float32 binary file into DATA_DIR and log basic stats."""
    out_path = os.path.join(DATA_DIR, name)
    data.astype(np.float32).tofile(out_path)
    stats = f"min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}"
    print(f"  Wrote {out_path}: {len(data)} dims, " + stats)
# Mapping: activation file → imatrix files for each weight it multiplies with
# Each weight tensor's column dimension matches the activation's column dimension
mappings = [
    {
        "act_file": "act_blk0_ffn_input.f32bin",
        "imatrix_name": "imatrix_blk0_ffn_gate_up.f32bin",
        "description": "ffn_gate and ffn_up (both use ffn_input activation)",
    },
    {
        "act_file": "act_blk0_ffn_down_input.f32bin",
        "imatrix_name": "imatrix_blk0_ffn_down.f32bin",
        "description": "ffn_down (uses SwiGLU activation)",
    },
    {
        "act_file": "act_blk0_attn_input.f32bin",
        "imatrix_name": "imatrix_blk0_attn_qkv.f32bin",
        "description": "attn_q, attn_k, attn_v (all use attn_input activation)",
    },
    {
        "act_file": "act_blk0_attn_output_input.f32bin",
        "imatrix_name": "imatrix_blk0_attn_output.f32bin",
        "description": "attn_output (uses kqv_out activation)",
    },
]
print("Computing imatrix from captured activations")
print("=" * 60)
for m in mappings:
    # Missing activation files are skipped with a message rather than aborting,
    # so this script works on a partial capture.
    try:
        A = load_f32_tensor(m["act_file"])  # rows = tokens, cols = hidden dims
        print(f"\n{m['description']}:")
        print(f"  Activation: {A.shape[0]} tokens × {A.shape[1]} dims")
        # imatrix = sum over tokens of activation^2
        # This is the standard definition used by llama.cpp
        imatrix = np.sum(A**2, axis=0)
        # Also compute per-dim RMS for reference
        rms = np.sqrt(np.mean(A**2, axis=0))
        print(
            f"  Imatrix stats: min={imatrix.min():.6f}, max={imatrix.max():.6f}, "
            f"mean={imatrix.mean():.6f}, std={imatrix.std():.6f}"
        )
        print(
            f"  RMS stats: min={rms.min():.6f}, max={rms.max():.6f}, "
            f"mean={rms.mean():.6f}"
        )
        # Concentration metrics: how much of the total importance mass sits in
        # the top 1% / 10% of dimensions (sorted descending).
        total = imatrix.sum()
        sorted_im = np.sort(imatrix)[::-1]
        top1pct = max(1, int(len(imatrix) * 0.01))
        top10pct = max(1, int(len(imatrix) * 0.10))
        print(f"  Power concentration:")
        print(
            f"    Top 1% dims ({top1pct}): {sorted_im[:top1pct].sum() / total * 100:.1f}% of total"
        )
        print(
            f"    Top 10% dims ({top10pct}): {sorted_im[:top10pct].sum() / total * 100:.1f}% of total"
        )
        save_imatrix(m["imatrix_name"], imatrix)
    except Exception as e:
        print(f"  SKIP: {e}")
print("\nDone.")

View File

@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""Extract real activation tensors by running a forward pass through the model.
Captures the INPUT activations to specific weight tensors (the vectors that get
multiplied by the weight matrix). These are what matter for quantization quality:
quantization error * activation magnitude = output error.
Usage:
python3 scripts/extract-activations.py MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]
Output:
For each target tensor, writes a .f32bin file with header:
int64_t n_rows, int64_t row_len
followed by n_rows * row_len float32 values.
n_rows = number of tokens, row_len = hidden dimension.
NOTE: This uses a simplified forward pass (no KV cache, single prompt).
Activations are extracted from after the norm layers (the actual matmul inputs).
"""
import sys
import os
import struct
import numpy as np
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
from gguf import GGUFReader
def bf16_to_f32(raw_bytes):
    """Reinterpret a buffer of bfloat16 values as a float32 numpy array.

    bfloat16 is the high 16 bits of an IEEE-754 float32, so widening each
    value to 32 bits and shifting it into the upper half reconstructs the
    exact float32 bit pattern.
    """
    halves = np.frombuffer(raw_bytes, dtype=np.uint16)
    widened = np.left_shift(halves.astype(np.uint32), 16)
    return widened.view(np.float32)
def rms_norm(x, weight, eps=1e-6):
    """Root-mean-square normalization over the last axis (Qwen3/Llama style).

    Divides by sqrt(mean(x^2) + eps), then applies the per-dimension gain.
    """
    mean_sq = np.mean(np.square(x), axis=-1, keepdims=True)
    scale = np.sqrt(mean_sq + eps)
    return (x / scale) * weight
def silu(x):
    """SiLU (swish) activation: x * sigmoid(x).

    The exponent argument is clipped to ±88 so np.exp never overflows float64.
    """
    neg_exp = np.exp(-np.clip(x, -88, 88))
    return x / (1.0 + neg_exp)
def softmax(x, axis=-1):
    """Numerically stable softmax along `axis`.

    Subtracting the per-slice maximum before exponentiation prevents overflow
    without changing the result.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    total = np.sum(exps, axis=axis, keepdims=True)
    return exps / total
def main():
    # Run a simplified forward pass through ONE layer of a GGUF model and dump
    # the matmul input activations for that layer as .f32bin files.
    # argv: MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]")
        sys.exit(1)
    model_path = sys.argv[1]
    output_dir = sys.argv[2]
    # Defaults; overridable via --prompt / --layer below.
    prompt_text = "The quick brown fox jumps over the lazy dog. In a distant galaxy, scientists discovered"
    target_layer = 16
    # Minimal hand-rolled flag parsing (deliberately no argparse dependency).
    for i in range(3, len(sys.argv)):
        if sys.argv[i] == "--prompt" and i + 1 < len(sys.argv):
            prompt_text = sys.argv[i + 1]
        elif sys.argv[i] == "--layer" and i + 1 < len(sys.argv):
            target_layer = int(sys.argv[i + 1])
    os.makedirs(output_dir, exist_ok=True)
    print(f"Loading {model_path}...")
    reader = GGUFReader(model_path)
    # Read model config from metadata.
    # NOTE(review): keys are matched by substring; this assumes each metadata key
    # contains exactly one of these fragments — confirm for the target architecture.
    config = {}
    for kv in reader.fields.values():
        if hasattr(kv, 'parts') and len(kv.parts) > 0:
            name = kv.name
            if 'block_count' in name:
                config['n_layer'] = int(kv.parts[-1][0])
            elif 'embedding_length' in name:
                config['hidden'] = int(kv.parts[-1][0])
            elif 'feed_forward_length' in name:
                config['ffn'] = int(kv.parts[-1][0])
            elif 'head_count_kv' in name:
                config['n_kv_heads'] = int(kv.parts[-1][0])
            elif 'head_count' in name and 'kv' not in name:
                # Checked after head_count_kv so the plain head-count key wins
                # only for non-KV keys.
                config['n_heads'] = int(kv.parts[-1][0])
            elif 'key_length' in name:
                # Per-head K dimension, used as head_dim for Q/K/V below.
                config['head_dim'] = int(kv.parts[-1][0])
            elif 'layer_norm_rms_epsilon' in name:
                config['eps'] = float(kv.parts[-1][0])
    print(f"Config: {config}")
    hidden = config['hidden']  # NOTE(review): assigned but unused below
    # Load tensors into a dict
    def load_tensor(name):
        # Linear scan over the GGUF tensor list; converts BF16/F16/F32 to float32.
        # 2D+ tensors are reshaped with dims reversed (GGUF stores innermost-first).
        for t in reader.tensors:
            if t.name == name:
                raw = bytes(t.data)
                shape = [int(s) for s in t.shape]
                n_el = int(t.n_elements)
                if t.tensor_type.name == 'BF16':
                    flat = bf16_to_f32(raw)
                elif t.tensor_type.name == 'F16':
                    flat = np.frombuffer(raw, dtype=np.float16).astype(np.float32)
                elif t.tensor_type.name == 'F32':
                    flat = np.frombuffer(raw, dtype=np.float32)
                else:
                    raise ValueError(f"Unsupported type: {t.tensor_type.name}")
                assert flat.shape[0] == n_el, f"Expected {n_el} elements, got {flat.shape[0]}"
                if len(shape) == 1:
                    return flat.copy()
                return flat.reshape(list(reversed(shape))).copy()
        raise KeyError(f"Tensor {name} not found")
    # Create simple token IDs from the prompt (use first few tokens from vocab)
    # We just need realistic activations, not perfect tokenization
    n_tokens = min(32, len(prompt_text.split()))
    print(f"Using {n_tokens} pseudo-tokens for activation extraction")
    # Load token embedding and create input
    print("Loading token_embd...")
    token_embd = load_tensor("token_embd.weight")  # [vocab, hidden]
    # Use token IDs 100-131 (arbitrary but avoids special tokens)
    token_ids = list(range(100, 100 + n_tokens))
    x = token_embd[token_ids]  # [n_tokens, hidden]
    print(f"Input shape: {x.shape}")
    # Run forward pass through target layer only (we just need the activations)
    # NOTE(review): the residual stream entering this layer is the raw embedding,
    # not the output of layers 0..target_layer-1 — acceptable for rough activations.
    layer = target_layer
    print(f"\nProcessing layer {layer}...")
    def save_activation(name, data):
        """Save activation tensor as f32bin."""
        # Layout: little-endian int64 n_rows, int64 row_len, then f32 payload.
        if data.ndim == 1:
            data = data.reshape(1, -1)
        n_rows, row_len = data.shape
        fname = os.path.join(output_dir, name + ".f32bin")
        with open(fname, 'wb') as fp:
            fp.write(struct.pack('<qq', n_rows, row_len))
            data.astype(np.float32).tofile(fp)
        print(f"  Saved {fname}: {n_rows} x {row_len} ({os.path.getsize(fname) / 1024:.1f} KB)")
    # Attention norm → input to attn_q/k/v
    attn_norm_w = load_tensor(f"blk.{layer}.attn_norm.weight")
    x_normed = rms_norm(x, attn_norm_w, config.get('eps', 1e-6))
    save_activation(f"act_blk{layer}_attn_input", x_normed)
    # Compute Q, K, V to get post-attention residual
    W_q = load_tensor(f"blk.{layer}.attn_q.weight")  # [n_heads*head_dim, hidden]
    W_k = load_tensor(f"blk.{layer}.attn_k.weight")  # [n_kv_heads*head_dim, hidden]
    W_v = load_tensor(f"blk.{layer}.attn_v.weight")  # [n_kv_heads*head_dim, hidden]
    W_o = load_tensor(f"blk.{layer}.attn_output.weight")  # [hidden, n_heads*head_dim]
    Q = x_normed @ W_q.T  # [n_tokens, n_heads*head_dim]
    K = x_normed @ W_k.T
    V = x_normed @ W_v.T
    # Simplified attention (no RoPE, no mask, no GQA — just need rough activations)
    n_heads = config['n_heads']
    head_dim = config['head_dim']
    Q_h = Q.reshape(n_tokens, n_heads, head_dim)
    K_h = K.reshape(n_tokens, config['n_kv_heads'], head_dim)
    V_h = V.reshape(n_tokens, config['n_kv_heads'], head_dim)
    # Repeat KV heads for GQA
    rep = n_heads // config['n_kv_heads']
    K_h = np.repeat(K_h, rep, axis=1)
    V_h = np.repeat(V_h, rep, axis=1)
    # Attention scores and output
    # t = query token, s = key token, h = head, d = head dim
    scores = np.einsum('thd,shd->ths', Q_h, K_h) / np.sqrt(head_dim)
    attn_w = softmax(scores, axis=-1)
    attn_out = np.einsum('ths,shd->thd', attn_w, V_h).reshape(n_tokens, -1)
    # attn_output weight input
    save_activation(f"act_blk{layer}_attn_output_input", attn_out)
    # Project and add residual
    attn_proj = attn_out @ W_o.T
    x = x + attn_proj
    # FFN norm → input to ffn_gate/ffn_up
    ffn_norm_w = load_tensor(f"blk.{layer}.ffn_norm.weight")
    x_ffn = rms_norm(x, ffn_norm_w, config.get('eps', 1e-6))
    save_activation(f"act_blk{layer}_ffn_input", x_ffn)
    # FFN: gate and up projections
    W_gate = load_tensor(f"blk.{layer}.ffn_gate.weight")  # [ffn, hidden]
    W_up = load_tensor(f"blk.{layer}.ffn_up.weight")  # [ffn, hidden]
    W_down = load_tensor(f"blk.{layer}.ffn_down.weight")  # [hidden, ffn] — NOTE(review): loaded but unused
    gate = x_ffn @ W_gate.T
    up = x_ffn @ W_up.T
    ffn_act = silu(gate) * up  # SwiGLU activation
    # ffn_down weight input (the SwiGLU output)
    save_activation(f"act_blk{layer}_ffn_down_input", ffn_act)
    print(f"\nDone! Extracted 4 activation tensors to {output_dir}/")
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""Extract tensor data from GGUF as raw f32 binary files for C++ testing.
Usage:
python3 scripts/extract-tensor-data.py MODEL.gguf pattern1 [pattern2 ...]
Output:
For each matching tensor, writes a .f32bin file with header:
int64_t n_rows, int64_t row_len
followed by n_rows * row_len float32 values.
"""
import sys
import os
import numpy as np
# Support running from build/ or repo root
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
from gguf import GGUFReader
def main():
    # Extract GGUF tensors whose names contain any of the given substrings and
    # write each as a raw .f32bin file (int64 n_rows, int64 row_len, f32 payload).
    # argv: MODEL.gguf pattern1 [pattern2 ...]
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} MODEL.gguf pattern1 [pattern2 ...]")
        print(f"  Extracts tensors whose names contain any of the given patterns.")
        sys.exit(1)
    model_path = sys.argv[1]
    patterns = sys.argv[2:]
    print(f"Reading {model_path}...")
    reader = GGUFReader(model_path)
    for tensor in reader.tensors:
        # Substring match against any requested pattern.
        if not any(p in tensor.name for p in patterns):
            continue
        print(f"\nExtracting: {tensor.name}")
        print(f"  Shape: {list(tensor.shape)}, type: {tensor.tensor_type.name}")
        # Convert to f32
        # NOTE(review): np.array(..., dtype=np.uint8) value-casts if tensor.data is
        # already a typed non-uint8 array; this assumes GGUFReader exposes raw bytes
        # here — confirm against gguf-py.
        raw = np.array(tensor.data, dtype=np.uint8)
        if tensor.tensor_type.name == 'BF16':
            # Widen each 16-bit value into the high half of a float32 bit pattern.
            bf16_vals = raw.view(np.uint16)
            f32_bits = bf16_vals.astype(np.uint32) << 16
            f32_vals = f32_bits.view(np.float32)
        elif tensor.tensor_type.name == 'F16':
            f16_vals = raw.view(np.float16)
            f32_vals = f16_vals.astype(np.float32)
        elif tensor.tensor_type.name == 'F32':
            f32_vals = raw.view(np.float32)
        else:
            print(f"  SKIP: unsupported type {tensor.tensor_type.name}")
            continue
        # Determine layout: GGUF stores shape as [col, row] for 2D
        row_len = int(tensor.shape[0])
        n_rows = tensor.n_elements // row_len
        fname = tensor.name.replace(".", "_") + ".f32bin"
        with open(fname, 'wb') as fp:
            fp.write(np.array([n_rows, row_len], dtype=np.int64).tobytes())
            f32_vals.tofile(fp)
        file_size = os.path.getsize(fname)
        print(f"  Wrote {fname}: {n_rows} rows x {row_len} cols = {tensor.n_elements} elements")
        print(f"  File size: {file_size / (1024*1024):.1f} MB")
        print(f"  Stats: mean={f32_vals.mean():.6f}, std={f32_vals.std():.6f}, "
              f"min={f32_vals.min():.6f}, max={f32_vals.max():.6f}")
if __name__ == "__main__":
    main()

View File

@ -511,6 +511,7 @@ public:
std::map<llama_seq_id, llama_sampler *> samplers;
};
//
// llm_graph_result
//

View File

@ -2,6 +2,7 @@
#include "ggml-alloc.h"
#include "ggml.h"
#include "llama.h"
#include "gguf.h"
#include "llama-hparams.h"
@ -61,6 +62,13 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels";
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels";
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return "Q2_KPT - Q2_K with learned levels";
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return "IQ2_TQ - 2.0625 bpw trellis quantized";
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return "IQ3_TQ - 3.5625 bpw per-tensor trained grid";
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.5625 bpw 8D vector quantized";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
@ -758,6 +766,13 @@ llama_model_loader::llama_model_loader(
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break;
case GGML_TYPE_Q3_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break;
case GGML_TYPE_Q4_DPT: ftype = LLAMA_FTYPE_MOSTLY_Q4_DPT; break;
case GGML_TYPE_Q2_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q2_KPT; break;
case GGML_TYPE_IQ2_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ2_TQ; break;
case GGML_TYPE_IQ3_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ3_TQ; break;
case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break;
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
default:

View File

@ -21,6 +21,7 @@
// TODO: tmp until the ggml meta backend matures and becomes public
#include "../src/ggml-ext.h"
#include <algorithm>
#include <cassert>
#include <cfloat>
@ -8247,6 +8248,175 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
// Load per-tensor quantization auxiliary data (levels/kvalues) from GGUF metadata.
// Indexed by weight tensor pointer for direct lookup during inference.
{
// Build tensor name to tensor pointer map
std::unordered_map<std::string, ggml_tensor*> name_to_tensor;
for (auto & [ctx, buf_map] : ctx_buf_maps) {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
name_to_tensor[ggml_get_name(t)] = t;
}
}
struct level_type_info {
ggml_type type;
const char * gguf_key;
size_t n_levels; // number of level values per tensor
size_t elem_bytes; // size of each level value
};
const level_type_info level_types[] = {
{ GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) },
{ GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) },
{ GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) },
};
for (const auto & lt : level_types) {
int64_t lv_idx = gguf_find_key(ml.metadata, lt.gguf_key);
if (lv_idx < 0) { continue; }
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
const size_t lv_arr_n = gguf_get_arr_n(ml.metadata, lv_idx);
size_t tensor_count = 0;
// Iterate over GGUF slots to find matching tensors
for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) {
std::string tensor_name = gguf_get_tensor_name(ml.metadata, gguf_slot);
auto it = name_to_tensor.find(tensor_name);
if (it == name_to_tensor.end()) { continue; }
ggml_tensor* t = it->second;
if (t->type != lt.type) { continue; }
const size_t gguf_offset = gguf_slot * lt.n_levels;
// Store directly indexed by tensor pointer
auto & aux = tensor_aux_data[t];
aux.type = lt.type;
aux.host_data.assign(
lv_raw + gguf_offset * lt.elem_bytes,
lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes
);
aux.aux_tensor = nullptr;
// Set quant_levels directly on the tensor
t->quant_levels = aux.host_data.data();
tensor_count++;
}
if (tensor_count > 0) {
LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n",
__func__, tensor_count, lt.gguf_key);
}
}
// Q2_KPT: per-block levels stored as per-tensor GGUF keys "{tensor_name}.q2kpt_levels"
// Each key holds n_blocks * Q2KPT_N_LEVELS floats for that tensor (4 floats per 256-element block).
{
size_t q2kpt_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_Q2_KPT) { continue; }
const std::string key = tname + ".q2kpt_levels";
int64_t lv_idx = gguf_find_key(ml.metadata, key.c_str());
if (lv_idx < 0) { continue; }
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
const size_t lv_n = gguf_get_arr_n(ml.metadata, lv_idx);
auto & aux = tensor_aux_data[t];
aux.type = GGML_TYPE_Q2_KPT;
aux.host_data.assign(lv_raw, lv_raw + lv_n * sizeof(float));
aux.aux_tensor = nullptr;
t->quant_levels = aux.host_data.data();
q2kpt_loaded++;
}
if (q2kpt_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded %zu Q2_KPT per-block level tables\n", __func__, q2kpt_loaded);
}
}
// IQ2_TQ: per-tensor trained grid (16 × 4 int8 = 64 bytes)
{
size_t iq2tq_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ2_TQ) { continue; }
const std::string grid_key = "iq2tq.grid." + tname;
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
if (grid_idx < 0) { continue; }
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ2_TQ;
taux.host_data.resize(64);
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
memcpy(taux.host_data.data(), grid_data, 64);
t->quant_levels = taux.host_data.data();
iq2tq_loaded++;
}
if (iq2tq_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ2_TQ grid for %zu tensors\n", __func__, iq2tq_loaded);
}
}
// IQ3_TQ: per-tensor trained grid (16 × 8 int8 = 128 bytes)
{
size_t iq3tq_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ3_TQ) { continue; }
const std::string grid_key = "iq3tq.grid." + tname;
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
if (grid_idx < 0) {
// backward compat: try old key name
const std::string old_key = "iq3qt.grid." + tname;
grid_idx = gguf_find_key(ml.metadata, old_key.c_str());
if (grid_idx < 0) { continue; }
}
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ3_TQ;
taux.host_data.resize(128);
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
memcpy(taux.host_data.data(), grid_data, 128);
t->quant_levels = taux.host_data.data();
iq3tq_loaded++;
}
if (iq3tq_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ3_TQ grid for %zu tensors\n", __func__, iq3tq_loaded);
}
}
// IQ1_BN: per-tensor trained codebook (32768 bytes)
{
size_t iq1bn_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ1_BN) { continue; }
const std::string aux_key = "iq1bn.aux." + tname;
int64_t aux_idx = gguf_find_key(ml.metadata, aux_key.c_str());
if (aux_idx < 0) { continue; }
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ1_BN;
taux.host_data.resize(32768);
const int8_t * aux_data = (const int8_t *)gguf_get_arr_data(ml.metadata, aux_idx);
memcpy(taux.host_data.data(), aux_data, 32768);
t->quant_levels = taux.host_data.data();
iq1bn_loaded++;
}
if (iq1bn_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ1_BN codebook for %zu tensors\n", __func__, iq1bn_loaded);
}
}
}
if (use_mmap_buffer) {
for (auto & mapping : ml.mappings) {
pimpl->mappings.emplace_back(std::move(mapping));

View File

@ -574,6 +574,24 @@ struct llama_model {
// for keeping track of associated LoRA adapters
std::unordered_set<llama_adapter_lora *> loras;
    // host-side auxiliary data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT)
    // indexed by weight tensor pointer, allows separate GPU placement of aux data
    struct tensor_auxiliary {
        ggml_type type; // Quantization type this aux data is for
        std::vector<uint8_t> host_data; // Host copy of aux data (levels or kvalues)
        struct ggml_tensor * aux_tensor; // Separate ggml tensor for backend placement
        // NOTE(review): members have no in-class initializers; confirm every
        // creation site fills all three fields (aux_tensor in particular).
    };
// Hash function for ggml_tensor pointers (reuse existing ggml_hash pattern)
struct ggml_tensor_ptr_hash {
size_t operator()(const ggml_tensor* t) const noexcept {
return (size_t)(uintptr_t)t >> 4; // Same as ggml_hash()
}
};
// Per-tensor auxiliary data lookup - indexed by WEIGHT tensor pointer
std::unordered_map<const ggml_tensor*, tensor_auxiliary, ggml_tensor_ptr_hash> tensor_aux_data;
// statically allocated context for assigning
struct llama_meta_device_get_split_state_userdata get_split_state_ud;

View File

@ -1,6 +1,8 @@
#include "ggml.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#include "llama.h"
#include "llama-ext.h"
#include <algorithm>
@ -13,6 +15,98 @@
#include <thread>
#include <unordered_map>
// Per-tensor quantization helper entry points (defined in ggml-quants.c).
// NOTE(review): this declaration set previously appeared twice verbatim
// (including repeated Q2KPT_N_LEVELS/QK_K defines); the duplicate copy has
// been removed — declarations and identical macro redefinitions are harmless
// but noisy.

// Q3_PT levels functions (defined in ggml-quants.c)
extern "C" {
void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                       const float * imatrix, float levels_out[8]);
void q3pt_set_levels(const float * levels);
}
// Q3_KPT levels functions (defined in ggml-quants.c)
extern "C" {
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                        const float * imatrix, float levels_out[8]);
void q3kpt_set_levels(const float * levels);
}
// Q4_DPT levels functions (defined in ggml-quants.c)
extern "C" {
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                        const float * imatrix, int8_t levels_out[16]);
void q4dpt_set_levels(const int8_t * levels);
}
// Q2_KPT levels are handled internally by quantize_q2_kpt
#define Q2KPT_N_LEVELS 4
#define QK_K 256
extern "C" const float * q2kpt_get_levels(void);
extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
extern "C" void q2kpt_free_levels(void);
// IQ2_TQ functions — per-tensor trained grid
extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
extern "C" void iq2tq_set_grid(const int8_t grid[64]);
extern "C" const int8_t * iq2tq_get_grid(void);
// IQ3_TQ functions — per-tensor trained grid (3-bit, 128 bytes per tensor)
extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
extern "C" void iq3tq_set_grid(const int8_t grid[128]);
extern "C" const int8_t * iq3tq_get_grid(void);
// IQ1_BN functions — 8D vector quantized with per-tensor trained 4096-entry codebook (32768 bytes per tensor)
extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
extern "C" const int8_t * iq1bn_get_aux(void);
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
@ -234,7 +328,7 @@ static void llama_tensor_dequantize_impl(
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
qtype->to_float(tensor->data, f32_output, nelements);
qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels);
} else {
GGML_ABORT("fatal error"); // unreachable
}
@ -264,13 +358,14 @@ static void llama_tensor_dequantize_impl(
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
const void * quant_levels = tensor->quant_levels;
auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
qtype->to_float(inbuf, outbuf, nels);
qtype->to_float(inbuf, outbuf, nels, quant_levels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
@ -480,6 +575,18 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
@ -518,13 +625,16 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_Q3_PT;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@ -569,16 +679,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
: (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K);
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
@ -587,6 +698,9 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
@ -616,13 +730,14 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ||
ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q5_K;
}
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
}
@ -828,6 +943,14 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
case LLAMA_FTYPE_MOSTLY_IQ3_S:
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_PT: return GGML_TYPE_Q3_PT;
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return GGML_TYPE_Q3_KPT;
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return GGML_TYPE_Q4_DPT;
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return GGML_TYPE_Q2_KPT;
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return GGML_TYPE_IQ2_TQ;
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return GGML_TYPE_IQ3_TQ;
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return GGML_TYPE_IQ1_BN;
default: return GGML_TYPE_COUNT;
}
@ -1098,6 +1221,615 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
::zeros(fout, meta_size);
};
// Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
static const size_t Q3PT_N_LEVELS = 8;
std::vector<float> q3pt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__);
q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f);
// Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below)
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_PT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (new_type != GGML_TYPE_Q3_PT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3pt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3pt_all_levels.data() + ti * Q3PT_N_LEVELS);
}
// All levels ready — store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32,
q3pt_all_levels.data(), q3pt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
}
// Q3_KPT two-pass approach: train all per-tensor levels BEFORE opening the output
static const size_t Q3KPT_N_LEVELS = 8;
std::vector<float> q3kpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_KPT pass 1: training per-tensor levels...\n", __func__);
q3kpt_all_levels.assign(tensors.size() * Q3KPT_N_LEVELS, 0.0f);
// Temporary dequant buffer for pass 1
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q3_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS);
}
// All levels ready — store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32,
q3kpt_all_levels.data(), q3kpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__);
}
// Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
static const size_t Q4DPT_N_LEVELS = 16;
std::vector<int8_t> q4dpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__);
q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q4_DPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS);
}
// Store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8,
q4dpt_all_levels.data(), q4dpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__);
}
// Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
// Per-block levels: 4 floats per 256-element block.
struct q2kpt_tensor_levels {
std::string name;
std::vector<float> levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats
};
std::vector<q2kpt_tensor_levels> q2kpt_all_levels;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q2_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
// Allocate levels buffer for this tensor
const int nb = n_per_row / QK_K;
const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS;
q2kpt_all_levels.push_back({tname, std::vector<float>(n_levels)});
LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n",
__func__, ti+1, tensors.size(), tensor->name, n_levels);
// Train levels by running quantization internally
// We need to quantize to f32 -> Q2_KPT -> f32 to get the trained levels
std::vector<no_init<uint8_t>> p1_qbuf(ggml_nbytes(tensor));
const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row);
// Prepare levels buffer for this tensor
q2kpt_free_levels();
q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row);
// Quantize each expert slice
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
// start_row must be the absolute row index for correct levels indexing
ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03);
}
// Copy trained levels to our storage
const float * trained_levels = q2kpt_get_levels();
if (trained_levels) {
memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float));
}
}
// Store all levels in GGUF metadata before the file is opened
for (const auto & tl : q2kpt_all_levels) {
for (auto & ctx : ctx_outs) {
if (ctx) {
const std::string key = tl.name + ".q2kpt_levels";
gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32,
tl.levels.data(), tl.levels.size());
}
}
}
LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__);
}
// IQ2_TQ: train per-tensor grid in pass 1
struct iq2tq_meta {
std::string tensor_name;
int8_t grid[64];
};
std::vector<iq2tq_meta> iq2tq_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ2_TQ) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Mirror pass-2 logic: only quantize 2D+ weight tensors
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ2_TQ) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ2_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq2tq_meta meta;
meta.tensor_name = tname;
iq2tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq2tq_all_meta.push_back(meta);
// Save to GGUF
std::string grid_key = "iq2tq.grid." + tname;
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 64);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq2tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ3_TQ: train per-tensor grid in pass 1 (16 entries × 8 levels = 128 bytes)
struct iq3tq_meta {
std::string tensor_name;
int8_t grid[128];
};
std::vector<iq3tq_meta> iq3tq_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ3_TQ) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ3_TQ) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ3_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq3tq_meta meta;
meta.tensor_name = tname;
iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq3tq_all_meta.push_back(meta);
std::string grid_key = "iq3tq.grid." + tname;
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 128);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq3tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ1_BN: train per-tensor codebook in pass 1 (4096 × 8D centroids = 32768 bytes)
struct iq1bn_meta {
std::string tensor_name;
int8_t aux[32768];
};
std::vector<iq1bn_meta> iq1bn_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1: training per-tensor codebooks...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ1_BN) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ1_BN codebook for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq1bn_meta meta;
meta.tensor_name = tname;
iq1bn_train_codebook(f32_data, nrows, n_per_row, imatrix, meta.aux, nthread);
iq1bn_all_meta.push_back(meta);
std::string aux_key = "iq1bn.aux." + tname;
gguf_set_arr_data(ctx_outs[0].get(), aux_key.c_str(), GGUF_TYPE_INT8, meta.aux, 32768);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq1bn_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// no output file for --dry-run
if (!params->dry_run) {
new_ofstream(0);
@ -1106,6 +1838,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
//
// main loop: iterate over all weights
//
size_t tensor_pass2_idx = 0; // index into tensors[], used for Q3_PT levels lookup
for (size_t i = 0; i < tensors.size(); ++i) {
const auto & weight = *tensors[i];
@ -1232,6 +1965,75 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_PT) {
q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS);
}
// Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_KPT) {
q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS);
}
// Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q4_DPT) {
q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS);
}
// IQ2_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ2_TQ) {
bool found = false;
for (const auto & meta : iq2tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq2tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ2_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ3_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ3_TQ) {
bool found = false;
for (const auto & meta : iq3tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq3tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ3_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ1_BN: set per-tensor trained codebook
if (new_type == GGML_TYPE_IQ1_BN) {
bool found = false;
for (const auto & meta : iq1bn_all_meta) {
if (meta.tensor_name == tm.name) {
iq1bn_set_aux(meta.aux);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained codebook for IQ1_BN tensor %s\n", __func__, tm.name.c_str());
}
}
// Q2_KPT: quantize_q2_kpt trains per-block levels internally.
// Levels were already trained and saved to GGUF in pass 1.
// We still need to allocate the levels buffer for quantization to work correctly.
if (new_type == GGML_TYPE_Q2_KPT) {
const int64_t total_rows = nrows * tensor->ne[2];
q2kpt_free_levels(); // Clear any stale levels from previous tensor
q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor
}
// quantize each expert separately since they have different importance matrices
new_size = 0;
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
@ -1255,7 +2057,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
} // no --dry-run
} // main loop
tensor_pass2_idx++;
} // iterate over tensors
if (!params->dry_run) {
close_ofstream();

View File

@ -257,6 +257,9 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-rope.cpp)
endif()
# Quantization laboratory - tests for 2.5 BPW proposals
llama_build_and_test(test-quant-laboratory.cpp)
# libmtmd
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)

View File

@ -261,7 +261,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt->to_float(&buf[i], vq.data(), bs);
tt->to_float(&buf[i], vq.data(), bs, nullptr);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");

View File

@ -0,0 +1,355 @@
// test-quant-laboratory.cpp
// Reusable testing harness for quantization experiments.
//
// Provides:
// - Synthetic data generators (Gaussian, Laplace, uniform)
// - Real tensor data loading (f32bin format with [nrow, ncol] header)
// - Importance matrix loading (flat f32 array)
// - RMSE computation
// - Multi-approach comparison framework (quantize → dequantize → matmul error)
// - ggml graph-level verification skeleton
//
// To add a new experiment:
// 1. Add an approach function: void approach_xxx(const float *W, float *out,
// int64_t nrow, int64_t ncol,
// const float *imatrix)
// 2. Register it in compare_approaches()
// 3. Call test_approach_comparison() from main()
#include "../ggml/src/ggml-quants.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
#include "ggml.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <random>
#include <string>
#include <vector>
// ============================================================================
// Helper functions
// ============================================================================
// Root-mean-square error between two float arrays of length n.
// Accumulates in double to limit rounding error.
// Returns 0 for empty input; the previous code divided by zero and
// returned NaN when n == 0.
static float rmse(const float * a, const float * b, size_t n) {
    if (n == 0) {
        return 0.0f;
    }
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const double d = (double) a[i] - (double) b[i];
        sum += d * d;
    }
    return (float) sqrt(sum / (double) n);
}
// Fill `data` with n samples drawn from N(0, sigma^2) using `gen`.
static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) {
    std::normal_distribution<float> dist(0.0f, sigma);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// Fill `data` with n samples from a Laplace(0, b) distribution via
// inverse-CDF sampling of a uniform variate on (-0.5, 0.5).
static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) {
    std::uniform_real_distribution<float> u(-0.5f, 0.5f);
    for (size_t i = 0; i < n; ++i) {
        const float v = u(gen);
        const float sign = (v > 0) ? 1.0f : ((v < 0) ? -1.0f : 0.0f);
        data[i] = -b * sign * logf(1.0f - 2.0f * fabsf(v));
    }
}
// Fill `data` with n samples drawn uniformly from [-range, range).
static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) {
    std::uniform_real_distribution<float> dist(-range, range);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// Fill `data` with n samples from N(offset, sigma^2) — a Gaussian whose
// mean is shifted away from zero.
static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) {
    std::normal_distribution<float> dist(offset, sigma);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// ============================================================================
// Data loading
// ============================================================================
// Load an f32bin tensor: a file starting with an [nrow, n_per_row] int64
// header followed by nrow * n_per_row raw float32 values.
// On success fills `data`, `nrow`, `n_per_row` and returns true.
// Returns false on: missing file, short header, corrupt header (negative
// dimensions or a product that would overflow int64 — previously such
// headers were passed straight to resize()), or a short data read.
static bool load_f32_tensor(const char * path, std::vector<float> & data, int64_t & nrow, int64_t & n_per_row) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }
    int64_t header[2];
    if (fread(header, sizeof(int64_t), 2, f) != 2) {
        fclose(f);
        return false;
    }
    nrow      = header[0];
    n_per_row = header[1];
    // Validate the header before sizing the buffer: negative dimensions or an
    // overflowing product must not reach data.resize().
    if (nrow < 0 || n_per_row < 0 ||
        (n_per_row > 0 && nrow > INT64_MAX / n_per_row)) {
        fclose(f);
        return false;
    }
    const int64_t total = nrow * n_per_row;
    data.resize((size_t) total);
    const size_t nread = fread(data.data(), sizeof(float), (size_t) total, f);
    fclose(f);
    return (int64_t) nread == total;
}
// Load imatrix file (flat f32 array, no header, one importance value per column dimension)
// The imatrix is the sum-of-squares of activations per dimension.
// Load an imatrix file: a flat f32 array (no header), one importance value
// per column dimension; values are the sum-of-squares of activations.
// The dimension count is inferred from the file size.
// Returns false on: missing file, empty file or ftell() failure (previously
// an empty file fell through and read data[0] on an empty vector — undefined
// behavior), a dimension mismatch when expected_dims > 0, or a short read.
// On success prints min/max/mean stats and returns true.
static bool load_imatrix(const char * path, std::vector<float> & data, int64_t expected_dims) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }
    // Get file size to determine dimensions
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
    if (file_size <= 0) {
        // empty file or ftell() error: there is nothing valid to load
        fclose(f);
        return false;
    }
    int64_t dims = file_size / (int64_t) sizeof(float);
    if (expected_dims > 0 && dims != expected_dims) {
        printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims);
        fclose(f);
        return false;
    }
    data.resize((size_t) dims);
    size_t nread = fread(data.data(), sizeof(float), (size_t) dims, f);
    fclose(f);
    if ((int64_t) nread != dims) {
        return false;
    }
    // Compute stats — dims >= 1 is guaranteed above, so data[0] is valid
    float imin = data[0], imax = data[0], isum = 0;
    for (int64_t i = 0; i < dims; i++) {
        if (data[i] < imin) imin = data[i];
        if (data[i] > imax) imax = data[i];
        isum += data[i];
    }
    printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n",
           (long long) dims, imin, imax, isum / dims);
    return true;
}
// ============================================================================
// Test class
// ============================================================================
class QuantLaboratory {
public:
// Fixed seed (42) so synthetic data — and hence all results — are
// reproducible across runs.
QuantLaboratory() : gen(42) {}
// ========================================================================
// MULTI-APPROACH COMPARISON FRAMEWORK
//
// Each "approach" is a function that takes float weights and produces
// dequantized float output. The framework computes:
// - Weight RMSE (dequant vs original)
// - Matmul error (dequant weights x real activations vs f64 reference)
// - Ratio vs first approach (typically Q2_K baseline)
//
// To add a new approach:
// 1. Write: void approach_xxx(const float *W, float *out,
// int64_t nrow, int64_t ncol,
// const float *imatrix) { ... }
// 2. Add it to the `approaches` array in compare_approaches()
// ========================================================================
// -- Example approach: Q2_K baseline (via ggml library) --
// Uncomment and adapt for your experiment:
//
// void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) {
// size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol);
// std::vector<uint8_t> buf(nrow * rs);
// quantize_q2_K(W, buf.data(), nrow, ncol, imatrix);
// auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K);
// for (int64_t r = 0; r < nrow; r++) {
// tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL);
// }
// }
// Compare every registered quantization approach on one weight/activation
// pair and print a summary table to stdout.
//
// For each approach, the framework quantizes+dequantizes the first
// min(w_nrow, 256) rows of W and reports:
//   - BPW       : bits per weight claimed by the approach
//   - RMSE      : weight reconstruction error (dequantized vs original)
//   - MatmulErr : RMSE of (A x dequantized-W) against a double-precision
//                 reference product
//   - ratio     : MatmulErr relative to the first registered approach
//                 (intended to be the Q2_K baseline)
//
// W       : row-major weights, w_nrow x w_ncol
// A       : row-major activations, a_nrow x a_ncol (a_ncol must equal w_ncol)
// name    : label printed in the table header
// imatrix : optional per-column importance values forwarded to each
//           approach; may be null
//
// Silently returns when the shapes are incompatible.
void compare_approaches(const float * W,
                        int64_t w_nrow,
                        int64_t w_ncol,
                        const float * A,
                        int64_t a_nrow,
                        int64_t a_ncol,
                        const char * name,
                        const float * imatrix) {
    if (w_ncol != a_ncol) {
        return;
    }
    // Cap the number of weight rows so the O(a_nrow * nr * nc) reference
    // matmul stays cheap.
    int64_t nr = std::min(w_nrow, (int64_t) 256);
    int64_t nc = w_ncol;
    // Reference matmul (double precision)
    std::vector<double> ref(a_nrow * nr);
    for (int64_t t = 0; t < a_nrow; t++) {
        for (int64_t r = 0; r < nr; r++) {
            double s = 0;
            for (int64_t c = 0; c < nc; c++) {
                s += (double) A[t * a_ncol + c] * (double) W[r * nc + c];
            }
            ref[t * nr + r] = s;
        }
    }
    // RMS magnitude of the reference output; currently unused but kept for
    // future relative-error reporting.
    double ref_mag2 = 0;
    for (auto v : ref) {
        ref_mag2 += v * v;
    }
    float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr));
    (void) ref_rms;
    // One candidate quantization scheme: display name, bits-per-weight, and
    // a callable that dequantizes W into a caller-provided float buffer.
    struct Approach {
        const char * name;
        float bpw;
        std::function<void(const float *, float *, int64_t, int64_t, const float *)> fn;
    };
    // ── Register approaches here ──
    Approach approaches[] = {
        // { "Q2_K (baseline)", 2.625f,
        //   [&](auto * W, auto * o, auto nr, auto nc, auto * im) {
        //       approach_q2k(W, o, nr, nc, im);
        //   } },
        // Add more approaches...
        { "placeholder", 0.0f, nullptr }, // remove once real approaches added
    };
    printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K");
    printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---");
    // MatmulErr of the first evaluated approach; later rows are reported
    // relative to it.
    float baseline_matmul_err = 0;
    for (auto & ap : approaches) {
        if (!ap.fn) {
            continue;   // skip unregistered placeholder entries
        }
        std::vector<float> dec(nr * nc);
        ap.fn(W, dec.data(), nr, nc, imatrix);
        // Weight RMSE
        double werr2 = 0;
        for (int64_t i = 0; i < nr * nc; i++) {
            double d = W[i] - dec[i];
            werr2 += d * d;
        }
        float wrmse = (float) sqrt(werr2 / (nr * nc));
        // Matmul error
        double merr2 = 0;
        for (int64_t t = 0; t < a_nrow; t++) {
            for (int64_t r = 0; r < nr; r++) {
                double s = 0;
                for (int64_t c = 0; c < nc; c++) {
                    s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c];
                }
                double d = s - ref[t * nr + r];
                merr2 += d * d;
            }
        }
        float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr));
        if (baseline_matmul_err == 0) {
            baseline_matmul_err = matmul_rmse;
        }
        float ratio = (baseline_matmul_err > 1e-10f) ? matmul_rmse / baseline_matmul_err : 0;
        printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio);
    }
}
// Run comparison on all tensor pairs from data directory
// Run the multi-approach comparison on every weight/activation pair that can
// be loaded from the given data directory; pairs with missing files are
// skipped silently, a missing imatrix only downgrades to uniform weighting.
// Always returns 0 (the comparison itself is informational, not pass/fail).
int test_approach_comparison(const char * data_dir) {
    printf("\n");
    printf("=======================================================================\n");
    printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n");
    printf("=======================================================================\n");
    // Each entry: weight tensor file, activation capture file, optional
    // importance-matrix file, and a human-readable label for the report.
    struct TestPair {
        const char * wf;
        const char * af;
        const char * imf;
        const char * name;
    };
    static const TestPair pairs[] = {
        { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" },
        { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" },
        { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" },
        { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" },
    };
    for (const auto & tp : pairs) {
        char w_path[512], a_path[512], im_path[512];
        snprintf(w_path, sizeof(w_path), "%s/%s", data_dir, tp.wf);
        snprintf(a_path, sizeof(a_path), "%s/%s", data_dir, tp.af);
        snprintf(im_path, sizeof(im_path), "%s/%s", data_dir, tp.imf);
        std::vector<float> weights, acts, imat;
        int64_t w_rows, w_cols, a_rows, a_cols;
        // Short-circuit: do not try the activations if the weights are absent.
        const bool loaded = load_f32_tensor(w_path, weights, w_rows, w_cols) &&
                            load_f32_tensor(a_path, acts, a_rows, a_cols);
        if (!loaded) {
            continue;
        }
        const float * im_ptr = nullptr;
        if (load_imatrix(im_path, imat, w_cols)) {
            im_ptr = imat.data();
        } else {
            printf(" [%s] No imatrix found, using uniform weights\n", tp.name);
        }
        compare_approaches(weights.data(), w_rows, w_cols, acts.data(), a_rows, a_cols, tp.name, im_ptr);
    }
    printf("\n");
    return 0;
}
private:
std::mt19937 gen;
};
// ============================================================================
// Main
// ============================================================================
// Entry point: runs the real-data comparison suite when capture data is
// present (directory from argv[1], default "data"), otherwise prints the
// instructions for generating it. Exit status is non-zero on any failure.
int main(int argc, char ** argv) {
    ggml_backend_load_all();
    QuantLaboratory lab;
    int total_fail = 0;
    printf("Quantization Laboratory\n");
    printf("=======================\n");
    // Real data tests (from data/ directory)
    {
        const char * data_dir = (argc > 1) ? argv[1] : "data";
        // Probe for one known capture file to decide whether data exists.
        char probe[512];
        snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir);
        if (FILE * fp = fopen(probe, "rb")) {
            fclose(fp);
            total_fail += lab.test_approach_comparison(data_dir);
        } else {
            printf("\n=== Real Data Tests SKIPPED ===\n");
            printf(" No data found at %s\n", data_dir);
            printf(
                " Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf "
                "blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n");
            printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n");
        }
    }
    printf("\n\n=== Testing Complete: %d failures ===\n", total_fail);
    return total_fail > 0 ? 1 : 0;
}

View File

@ -54,7 +54,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_
std::vector<float> tmp_out(test_size);
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
return array_rmse(test_data, tmp_out.data(), test_size);
}
@ -66,10 +66,10 @@ static float reference_quantization_error(const ggml_type_traits * qfns, const g
// FIXME: why is done twice?
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size, nullptr);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
@ -95,7 +95,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1, nullptr);
const float dot_ref = dot_product(test_data1, test_data2, test_size);

View File

@ -309,7 +309,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
qfns->to_float(test_q1, test_out, size);
qfns->to_float(test_q1, test_out, size, nullptr);
return test_out[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -341,7 +341,7 @@ int main(int argc, char * argv[]) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
float result;
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1, nullptr);
return result;
};
size_t quantized_size = ggml_row_size(type, size);

View File

@ -158,7 +158,7 @@ static void test_roundtrip_on_chunk(
} else {
qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
}
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
qfns.to_float(quantized_scratch, output_scratch, chunk_size, nullptr);
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}

View File

@ -38,5 +38,6 @@ else()
add_subdirectory(export-lora)
endif()
add_subdirectory(fit-params)
add_subdirectory(capture-layer-data)
add_subdirectory(results)
endif()

View File

@ -0,0 +1,9 @@
# Build llama-capture-layer-data: dumps intermediate activation tensors from a
# model run into .f32bin files for the quantization laboratory.
set(TARGET llama-capture-layer-data)
add_executable(${TARGET} capture-layer-data.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# The tool includes headers from common/ directly (arg.h, common.h, log.h).
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# Install only when the project's tool installation switch is on.
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

View File

@ -0,0 +1,251 @@
// capture-layer-data.cpp
// Captures intermediate activation tensors during model inference
// and saves them as .f32bin files for the quantization laboratory.
//
// Usage:
// llama-capture-layer-data -m MODEL_PATH -l LAYER [-p PROMPT] [-o OUTPUT_DIR]
//
// Example:
// llama-capture-layer-data -m /devel/models/Qwen_Qwen3-4B-Instruct-2507-bf16.gguf -l 0 -o data
#include "arg.h"
#include "common.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "llama.h"
#include "log.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>
// One entry of the capture table: a graph tensor-name prefix and the suffix
// used when naming the output file.
struct TensorMapping {
    const char * graph_name_prefix;
    const char * output_suffix;
};
// Tensors we capture, keyed by the prefix of their name in the compute graph.
static const TensorMapping mappings[] = {
    { "attn_norm", "attn_input" },
    { "kqv_out", "attn_output_input" },
    { "ffn_norm", "ffn_input" },
    { "ffn_swiglu", "ffn_down_input" },
};
static constexpr int N_MAPPINGS = sizeof(mappings) / sizeof(mappings[0]);
// Mutable state threaded through the eval callback: which layer to capture,
// where to write files, how many tensors were saved, and the name of the
// tensor flagged during the callback's "ask" phase.
struct CaptureState {
    int target_layer;
    std::string output_dir;
    int captured_count = 0;
    std::string pending_name;
    // Translate a graph tensor name into its capture filename, e.g.
    // "attn_norm-3" -> "act_blk3_attn_input.f32bin". Returns "" when the name
    // matches none of the registered prefixes.
    std::string graph_to_filename(const char * graph_name) const {
        const std::string gname = graph_name;
        for (const auto & m : mappings) {
            const std::string prefix = m.graph_name_prefix;
            if (gname.compare(0, prefix.size(), prefix) == 0) {
                return "act_blk" + std::to_string(target_layer) + "_" + m.output_suffix + ".f32bin";
            }
        }
        return "";
    }
};
static CaptureState * g_capture_state = nullptr;
static void save_tensor_as_f32bin(const ggml_tensor * t, const std::string & filepath) {
int64_t n_rows = t->ne[1];
int64_t row_len = t->ne[0];
int64_t total = 1;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
total *= t->ne[i];
}
std::vector<float> f32_data(total);
if (t->type == GGML_TYPE_F32) {
const float * src = (const float *) t->data;
if (!src) {
LOG_ERR("Tensor %s has null data pointer\n", t->name);
return;
}
memcpy(f32_data.data(), src, total * sizeof(float));
} else if (t->type == GGML_TYPE_F16) {
const ggml_fp16_t * src = (const ggml_fp16_t *) t->data;
for (int64_t i = 0; i < total; i++) {
f32_data[i] = ggml_fp16_to_fp32(src[i]);
}
} else if (t->type == GGML_TYPE_BF16) {
const ggml_bf16_t * src = (const ggml_bf16_t *) t->data;
for (int64_t i = 0; i < total; i++) {
f32_data[i] = ggml_bf16_to_fp32(src[i]);
}
} else {
LOG_ERR("Unsupported tensor type %s for %s\n", ggml_type_name(t->type), t->name);
return;
}
std::ofstream file(filepath, std::ios::binary);
if (!file) {
LOG_ERR("Failed to open %s for writing\n", filepath.c_str());
return;
}
file.write(reinterpret_cast<const char *>(&n_rows), sizeof(int64_t));
file.write(reinterpret_cast<const char *>(&row_len), sizeof(int64_t));
file.write(reinterpret_cast<const char *>(f32_data.data()), total * sizeof(float));
file.close();
LOG(" Captured: %s -> %s (%lld x %lld, %s)\n", t->name, filepath.c_str(), (long long) n_rows, (long long) row_len,
ggml_type_name(t->type));
}
// ggml eval callback (two-phase protocol):
//  - ask == true : return whether we want to observe this tensor; we match
//    the exact graph name "<prefix>-<layer>" for each registered mapping and
//    remember the match in state->pending_name.
//  - ask == false: the tensor has been computed; save it if it is the one we
//    flagged, then clear the pending name. Always returns true to continue
//    graph evaluation.
//
// Fix: removed a dead ggml_backend_tensor_get copy in the non-host branch —
// the copied bytes were discarded, so it only wasted work. The warning is
// kept.
static bool capture_callback(ggml_tensor * t, bool ask, void * user_data) {
    auto * state = (CaptureState *) user_data;
    if (ask) {
        // Selection phase: does this tensor's exact name match any
        // "<prefix>-<target_layer>" pattern?
        char target[128];
        for (int i = 0; i < N_MAPPINGS; i++) {
            snprintf(target, sizeof(target), "%s-%d", mappings[i].graph_name_prefix, state->target_layer);
            if (strcmp(t->name, target) == 0) {
                state->pending_name = t->name;
                return true;
            }
        }
        return false;
    }
    if (state->pending_name.empty()) {
        return true; // nothing flagged for capture
    }
    if (strcmp(t->name, state->pending_name.c_str()) != 0) {
        return true; // not the tensor we flagged
    }
    if (!ggml_backend_buffer_is_host(t->buffer)) {
        // NOTE(review): the writer must fetch the bytes of non-host tensors
        // through the backend (ggml_backend_tensor_get) rather than read
        // t->data directly — verify save_tensor_as_f32bin does so.
        LOG_WRN("Tensor %s is not host-accessible, data copied via backend\n", t->name);
    }
    std::string filename = state->graph_to_filename(t->name);
    if (!filename.empty()) {
        std::filesystem::create_directories(state->output_dir);
        std::string filepath = (std::filesystem::path(state->output_dir) / filename).string();
        save_tensor_as_f32bin(t, filepath);
        state->captured_count++;
    }
    state->pending_name.clear();
    return true;
}
// Print the command-line help for this tool via the project logger.
static void print_usage(void) {
    LOG("Usage: llama-capture-layer-data -m MODEL_PATH [-l LAYER] [-p PROMPT] [-o OUTPUT_DIR]\n");
    LOG("\n");
    LOG(" -m MODEL Path to GGUF model (BF16/F16 recommended)\n");
    LOG(" -l LAYER Target layer index (default: 0)\n");
    LOG(" -p PROMPT Inference prompt (default: \"The quick brown fox...\")\n");
    LOG(" -o DIR Output directory for .f32bin files (default: data)\n");
}
// Entry point: load a model, hook the eval callback on layer `-l`, run one
// prompt through llama_decode, and write the captured activations as .f32bin
// files into the output directory. Returns non-zero on error or when nothing
// was captured.
int main(int argc, char ** argv) {
    // -m plus its value requires at least 3 argv entries; also honor -h/--help.
    if (argc < 3 || (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) {
        print_usage();
        return 1;
    }
    common_params params;
    int layer = 0;                   // target layer index (-l)
    std::string output_dir = "data"; // where .f32bin files go (-o)
    std::string prompt = "The quick brown fox jumps over the lazy dog.";
    std::string model_path;
    // Minimal hand-rolled flag parsing; unrecognized arguments are ignored.
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-m" && i + 1 < argc) {
            model_path = argv[++i];
        } else if (arg == "-l" && i + 1 < argc) {
            layer = atoi(argv[++i]);
        } else if (arg == "-p" && i + 1 < argc) {
            prompt = argv[++i];
        } else if (arg == "-o" && i + 1 < argc) {
            output_dir = argv[++i];
        }
    }
    if (model_path.empty()) {
        LOG_ERR("Error: -m MODEL_PATH is required\n\n");
        print_usage();
        return 1;
    }
    params.model.path = model_path;
    params.prompt = prompt;
    params.n_batch = 512;
    params.n_ubatch = 512;
    // n_gpu_layers = 0 keeps all layers on the CPU.
    params.n_gpu_layers = 0;
    params.fit_params = false;
    CaptureState state;
    state.target_layer = layer;
    state.output_dir = output_dir;
    g_capture_state = &state;
    // Register the eval callback so every computed graph tensor is offered
    // to capture_callback (ask/save phases).
    params.cb_eval = capture_callback;
    params.cb_eval_user_data = &state;
    LOG("Loading model: %s\n", model_path.c_str());
    LOG("Target layer: %d\n", layer);
    LOG("Output directory: %s\n", output_dir.c_str());
    // Backend/NUMA initialization must precede model loading.
    common_init();
    ggml_backend_load_all();
    llama_backend_init();
    llama_numa_init(params.numa);
    auto llama_init = common_init_from_params(params);
    if (!llama_init) {
        LOG_ERR("Failed to load model\n");
        return 1;
    }
    auto * model = llama_init->model();
    auto * ctx = llama_init->context();
    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("Failed to initialize context\n");
        return 1;
    }
    LOG("Model loaded successfully\n");
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const bool add_bos = llama_vocab_get_add_bos(vocab);
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
    if (tokens.empty()) {
        LOG_ERR("No tokens generated from prompt\n");
        return 1;
    }
    LOG("Tokenizing prompt: %zu tokens\n", tokens.size());
    LOG("Running inference...\n");
    // A single decode of the whole prompt triggers the eval callback for
    // every tensor in the graph; captures happen as a side effect.
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        LOG_ERR("llama_decode failed\n");
        return 1;
    }
    LOG("\nDone. Captured %d tensors to %s/\n", state.captured_count, output_dir.c_str());
    llama_backend_free();
    // Capturing nothing is treated as failure so CI scripts notice.
    return state.captured_count == 0 ? 1 : 0;
}

View File

@ -318,7 +318,7 @@ struct lora_merge_ctx {
auto nels = ggml_nelements(inp_base);
const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels, nullptr);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));

View File

@ -46,6 +46,13 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
{ "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", },
{ "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" },
{ "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" },
{ "Q2_KPT", LLAMA_FTYPE_MOSTLY_Q2_KPT, " Q2_K with learned per-tensor float levels" },
{ "IQ2_TQ", LLAMA_FTYPE_MOSTLY_IQ2_TQ, " 2.0625 bpw, trellis quantized" },
{ "IQ3_TQ", LLAMA_FTYPE_MOSTLY_IQ3_TQ, " 3.5625 bpw, per-tensor trained grid" },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.5625 bpw, 8D vector quantized" },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@ -162,6 +169,9 @@ static void usage(const char * executable) {
printf(" WARNING: this is an advanced option, use with care.\n");
printf(" --dry-run\n");
printf(" calculate and show the final quantization size without performing quantization\n");
printf(" --threads n\n");
printf(" number of threads to use for cross-tensor parallelization (default: 0, use same as within-tensor)\n");
printf(" when n > 0, enables parallel quantization of multiple tensors simultaneously\n");
printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
printf("-----------------------------------------------------------------------------\n");
@ -565,6 +575,8 @@ int main(int argc, char ** argv) {
}
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else {
usage(argv[0]);
}