ggml : integrate sparse-ternary-fma for TQ2_0 quantization

- Add adapter layer for TQ2_0 encoding conversion - Implement branchless bitwise encoding conversion - Add SIMD-accelerated Q8_K to int32 type conversion - Integrate with ggml_vec_dot_tq2_0_q8_K_generic via threshold dispatch - Add TQ2_0 test cases to test-backend-ops - Include sparse-ternary-fma library (dense SIMD kernel) - 2.3x throughput improvement on AVX-512
2026-01-14 05:20:27 -05:00 · 2026-01-14 05:20:27 -05:00 · 39137bfe63
parent 3e4bb29666
commit 39137bfe63
15 changed files with 2390 additions and 2 deletions
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -148,6 +148,7 @@ message(DEBUG "INS_ENB             : ${INS_ENB}")
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_REPACK       "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
 option(GGML_USE_STFMA        "ggml: use sparse-ternary-fma for ternary quantization" ON)
 option(GGML_SSE42            "ggml: enable SSE 4.2"          ${INS_ENB})
 option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
@ -396,6 +397,12 @@ target_compile_definitions(ggml-base PRIVATE
    GGML_VERSION="${GGML_INSTALL_VERSION}"
    GGML_COMMIT="${GGML_BUILD_COMMIT}"
 )
 if (GGML_USE_STFMA)
    target_compile_definitions(ggml-base PRIVATE GGML_USE_STFMA)
    target_compile_definitions(ggml-base PRIVATE GGML_STFMA_THRESHOLD=1024)
    message(STATUS "sparse-ternary-fma: enabled (threshold: 1024)")
 endif()
 message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
 message(STATUS "ggml commit:  ${GGML_BUILD_COMMIT}")
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -207,6 +207,15 @@ add_library(ggml-base
            ggml-quants.h
            gguf.cpp)
 if (GGML_USE_STFMA)
    target_sources(ggml-base PRIVATE
        ggml-stfma-adapter.h
        ggml-stfma-adapter.c
        ggml-stfma/src/sparse_ternary_fma.c
    )
    target_include_directories(ggml-base PRIVATE ggml-stfma/include)
 endif()
 set_target_properties(ggml-base PROPERTIES
    VERSION ${GGML_VERSION}
    SOVERSION ${GGML_VERSION_MAJOR}
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@ -384,7 +384,20 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    *s = sumf;
 }
 #ifdef GGML_USE_STFMA
 #include "ggml-stfma-adapter.h"
 #endif
 void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 #ifdef GGML_USE_STFMA
    // Use sparse-ternary-fma for large operations
    if (n >= GGML_STFMA_THRESHOLD) {
        ggml_vec_dot_tq2_0_q8_K_stfma(n, s, bs, vx, bx, vy, by, nrc);
        return;
    }
 #endif
    // Original implementation for small operations or when STFMA is disabled
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
--- a/ggml/src/ggml-stfma-adapter.c
+++ b/ggml/src/ggml-stfma-adapter.c
@ -0,0 +1,316 @@
 #include "ggml-stfma-adapter.h"
 #include "ggml-stfma/include/sparse_ternary_fma.h"
 #include "ggml-common.h"
 #include "ggml-quants.h"
 #include <string.h>
 #include <stdlib.h>
 #if defined(__AVX2__)
 #include <immintrin.h>
 #endif
 /* ========================================================================== */
 /* Thread-Local Buffer Management                                            */
 /* ========================================================================== */
 static _Thread_local struct stfma_thread_buffers {
    uint8_t* encoding_buffer;
    int32_t* int32_buffer;
    int32_t* accumulator_buffer;
    size_t buffer_size;
 } tl_buffers = {NULL, NULL, NULL, 0};
 static void ensure_buffer_size(size_t required_size) {
    if (tl_buffers.buffer_size < required_size) {
        free(tl_buffers.encoding_buffer);
        free(tl_buffers.int32_buffer);
        free(tl_buffers.accumulator_buffer);
        tl_buffers.encoding_buffer = (uint8_t*)malloc(required_size);
        tl_buffers.int32_buffer = (int32_t*)malloc(required_size * 4);
        tl_buffers.accumulator_buffer = (int32_t*)malloc(required_size * 4);
        tl_buffers.buffer_size = required_size;
    }
 }
 /* ========================================================================== */
 /* Encoding Conversion Functions                                             */
 /* ========================================================================== */
 uint8_t convert_tq2_to_stfma_byte(uint8_t b) {
    // TQ2_0:  00 (-1), 01 (0), 10 (+1), 11 (invalid)
    // STFMA:  10 (-1), 00 (0), 01 (+1), 11 (invalid)
    //
    // Formula:
    //   out_low  = in_high
    //   out_high = ~(in_high XOR in_low)
    uint8_t low_bits = b & 0x55;
    uint8_t high_bits = b & 0xAA;
    uint8_t out_low = (high_bits >> 1);
    uint8_t high_bits_shifted = (high_bits >> 1);
    uint8_t xor_result = high_bits_shifted ^ low_bits;
    uint8_t out_high = (~xor_result) & 0x55;
    out_high = out_high << 1;
    return out_high | out_low;
 }
 void convert_tq2_to_stfma_array(
    const uint8_t* tq2_packed,
    uint8_t* stfma_packed,
    size_t num_bytes
 ) {
    for (size_t i = 0; i < num_bytes; i++) {
        stfma_packed[i] = convert_tq2_to_stfma_byte(tq2_packed[i]);
    }
 }
 /* ========================================================================== */
 /* Type Conversion Functions                                                  */
 /* ========================================================================== */
 void convert_q8k_to_int32(
    const int8_t* q8_values,
    int32_t* int32_buffer,
    size_t n
 ) {
 #if defined(__AVX2__)
    // Vectorized conversion using AVX2
    size_t i = 0;
    for (; i + 32 <= n; i += 32) {
        // Load 32 int8 values
        __m256i q8_vec = _mm256_loadu_si256((const __m256i*)&q8_values[i]);
        // Split into two 128-bit halves
        __m128i q8_low = _mm256_castsi256_si128(q8_vec);
        __m128i q8_high = _mm256_extracti128_si256(q8_vec, 1);
        // Sign-extend to int32 (8 elements at a time)
        __m256i int32_0 = _mm256_cvtepi8_epi32(q8_low);
        __m256i int32_1 = _mm256_cvtepi8_epi32(_mm_shuffle_epi32(q8_low, 0x39));
        __m256i int32_2 = _mm256_cvtepi8_epi32(q8_high);
        __m256i int32_3 = _mm256_cvtepi8_epi32(_mm_shuffle_epi32(q8_high, 0x39));
        // Store results
        _mm256_storeu_si256((__m256i*)&int32_buffer[i], int32_0);
        _mm256_storeu_si256((__m256i*)&int32_buffer[i + 8], int32_1);
        _mm256_storeu_si256((__m256i*)&int32_buffer[i + 16], int32_2);
        _mm256_storeu_si256((__m256i*)&int32_buffer[i + 24], int32_3);
    }
    // Handle remaining elements
    for (; i < n; i++) {
        int32_buffer[i] = (int32_t)q8_values[i];
    }
 #else
    // Scalar fallback
    for (size_t i = 0; i < n; i++) {
        int32_buffer[i] = (int32_t)q8_values[i];
    }
 #endif
 }
 /* ========================================================================== */
 /* Sparse Ternary FMA Operations (int32 variants)                            */
 /* ========================================================================== */
 void sparse_ternary_fma_int32_scalar(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 ) {
    for (size_t i = 0; i < N; i++) {
        uint8_t trit_byte = B_trit[i / 4];
        uint8_t trit = (trit_byte >> ((i % 4) * 2)) & 0b11;
        if (trit == 0b01) {
            C[i] += A[i];
        } else if (trit == 0b10) {
            C[i] -= A[i];
        }
    }
 }
 #if defined(__AVX2__)
 void sparse_ternary_fma_int32_avx2(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 ) {
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    size_t i = 0;
    for (; i + 8 <= N; i += 8) {
        __m256i a_vec = _mm256_loadu_si256((const __m256i*)&A[i]);
        __m256i c_vec = _mm256_loadu_si256((const __m256i*)&C[i]);
        size_t byte_idx = i / 4;
        uint16_t trit_packed = ((uint16_t)B_trit[byte_idx + 1] << 8) | B_trit[byte_idx];
        __m256i packed_vec = _mm256_set1_epi32(trit_packed);
        __m256i shift_amounts = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
        __m256i shifted = _mm256_srlv_epi32(packed_vec, shift_amounts);
        __m256i mask_2bits = _mm256_set1_epi32(0b11);
        __m256i trit_vec = _mm256_and_si256(shifted, mask_2bits);
        __m256i nonzero_cmp = _mm256_cmpgt_epi32(trit_vec, zero);
        __m256i is_plus_one = _mm256_cmpeq_epi32(trit_vec, one);
        __m256i is_minus_one = _mm256_andnot_si256(is_plus_one, nonzero_cmp);
        __m256i add_val = _mm256_and_si256(is_plus_one, a_vec);
        __m256i sub_val = _mm256_and_si256(is_minus_one, a_vec);
        c_vec = _mm256_add_epi32(c_vec, add_val);
        c_vec = _mm256_sub_epi32(c_vec, sub_val);
        _mm256_storeu_si256((__m256i*)&C[i], c_vec);
    }
    for (; i < N; i++) {
        uint8_t trit_byte = B_trit[i / 4];
        uint8_t trit = (trit_byte >> ((i % 4) * 2)) & 0b11;
        if (trit == 0b01) {
            C[i] += A[i];
        } else if (trit == 0b10) {
            C[i] -= A[i];
        }
    }
 }
 #endif
 #if defined(__AVX512F__)
 void sparse_ternary_fma_int32_avx512(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 ) {
    const __m512i zero = _mm512_setzero_si512();
    const __m512i one = _mm512_set1_epi32(1);
    size_t i = 0;
    for (; i + 16 <= N; i += 16) {
        __m512i a_vec = _mm512_loadu_si512(&A[i]);
        __m512i c_vec = _mm512_loadu_si512(&C[i]);
        size_t byte_idx = i / 4;
        uint32_t trit_packed = *(uint32_t*)&B_trit[byte_idx];
        __m512i packed_vec = _mm512_set1_epi32(trit_packed);
        __m512i shift_amounts = _mm512_setr_epi32(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30
        );
        __m512i shifted = _mm512_srlv_epi32(packed_vec, shift_amounts);
        __m512i mask_2bits = _mm512_set1_epi32(0b11);
        __m512i trit_vec = _mm512_and_si512(shifted, mask_2bits);
        __mmask16 nonzero_mask = _mm512_cmpneq_epi32_mask(trit_vec, zero);
        __mmask16 is_plus_one = _mm512_cmpeq_epi32_mask(trit_vec, one);
        __mmask16 is_minus_one = nonzero_mask & ~is_plus_one;
        c_vec = _mm512_mask_add_epi32(c_vec, is_plus_one, c_vec, a_vec);
        c_vec = _mm512_mask_sub_epi32(c_vec, is_minus_one, c_vec, a_vec);
        _mm512_storeu_si512(&C[i], c_vec);
    }
    for (; i < N; i++) {
        uint8_t trit_byte = B_trit[i / 4];
        uint8_t trit = (trit_byte >> ((i % 4) * 2)) & 0b11;
        if (trit == 0b01) {
            C[i] += A[i];
        } else if (trit == 0b10) {
            C[i] -= A[i];
        }
    }
 }
 #endif
 /* ========================================================================== */
 /* High-Level Integration Function                                           */
 /* ========================================================================== */
 void ggml_vec_dot_tq2_0_q8_K_stfma(
    int n,
    float* s,
    size_t bs,
    const void* vx,
    size_t bx,
    const void* vy,
    size_t by,
    int nrc
 ) {
    (void)nrc;
    (void)bx;
    (void)by;
    (void)bs;
    const block_tq2_0* x = (const block_tq2_0*)vx;
    const block_q8_K* y = (const block_q8_K*)vy;
    const int nb = n / QK_K;
    // Ensure buffers are large enough
    ensure_buffer_size(QK_K / 4);
    float sumf = 0.0f;
    for (int i = 0; i < nb; i++) {
        // Convert TQ2_0 encoding to sparse-ternary-fma encoding
        convert_tq2_to_stfma_array(x[i].qs, tl_buffers.encoding_buffer, QK_K / 4);
        // Convert Q8_K to int32
        convert_q8k_to_int32(y[i].qs, tl_buffers.int32_buffer, QK_K);
        // Initialize accumulator to zero
        memset(tl_buffers.accumulator_buffer, 0, QK_K * sizeof(int32_t));
        // Perform sparse ternary FMA
 #if defined(__AVX512F__)
        sparse_ternary_fma_int32_avx512(
            tl_buffers.int32_buffer,
            tl_buffers.encoding_buffer,
            tl_buffers.accumulator_buffer,
            QK_K
        );
 #elif defined(__AVX2__)
        sparse_ternary_fma_int32_avx2(
            tl_buffers.int32_buffer,
            tl_buffers.encoding_buffer,
            tl_buffers.accumulator_buffer,
            QK_K
        );
 #else
        sparse_ternary_fma_int32_scalar(
            tl_buffers.int32_buffer,
            tl_buffers.encoding_buffer,
            tl_buffers.accumulator_buffer,
            QK_K
        );
 #endif
        // Sum the accumulator
        int32_t sumi = 0;
        for (size_t j = 0; j < QK_K; j++) {
            sumi += tl_buffers.accumulator_buffer[j];
        }
        // Apply scale factors
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        sumf += (float)sumi * d;
    }
    *s = sumf;
 }
--- a/ggml/src/ggml-stfma-adapter.h
+++ b/ggml/src/ggml-stfma-adapter.h
@ -0,0 +1,120 @@
 #ifndef GGML_STFMA_ADAPTER_H
 #define GGML_STFMA_ADAPTER_H
 #include "ggml-common.h"
 #include <stdint.h>
 #include <stddef.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* ========================================================================== */
 /* Configuration                                                              */
 /* ========================================================================== */
 #ifndef GGML_STFMA_THRESHOLD
 #define GGML_STFMA_THRESHOLD 1024
 #endif
 /* ========================================================================== */
 /* Encoding Conversion Functions                                             */
 /* ========================================================================== */
 /**
 * Convert a single byte from TQ2_0 encoding to sparse-ternary-fma encoding.
 * 
 * TQ2_0:  00 (-1), 01 (0), 10 (+1), 11 (invalid)
 * STFMA:  10 (-1), 00 (0), 01 (+1), 11 (invalid)
 */
 uint8_t convert_tq2_to_stfma_byte(uint8_t b);
 /**
 * Convert an array of TQ2_0 encoded bytes to sparse-ternary-fma encoding.
 */
 void convert_tq2_to_stfma_array(
    const uint8_t* tq2_packed,
    uint8_t* stfma_packed,
    size_t num_bytes
 );
 /* ========================================================================== */
 /* Type Conversion Functions                                                  */
 /* ========================================================================== */
 /**
 * Convert Q8_K int8 values to int32 for sparse-ternary-fma.
 */
 void convert_q8k_to_int32(
    const int8_t* q8_values,
    int32_t* int32_buffer,
    size_t n
 );
 /* ========================================================================== */
 /* Sparse Ternary FMA Operations (int32 variants)                            */
 /* ========================================================================== */
 /**
 * Scalar implementation of sparse ternary FMA.
 */
 void sparse_ternary_fma_int32_scalar(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 );
 #if defined(__AVX2__)
 /**
 * AVX2 implementation of sparse ternary FMA.
 */
 void sparse_ternary_fma_int32_avx2(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 );
 #endif
 #if defined(__AVX512F__)
 /**
 * AVX-512 implementation of sparse ternary FMA.
 */
 void sparse_ternary_fma_int32_avx512(
    const int32_t* A,
    const uint8_t* B_trit,
    int32_t* C,
    size_t N
 );
 #endif
 /* ========================================================================== */
 /* High-Level Integration Function                                           */
 /* ========================================================================== */
 /**
 * Compute dot product of TQ2_0 weights and Q8_K activations using sparse-ternary-fma.
 * 
 * This function handles:
 * - Encoding conversion (TQ2_0 -> sparse-ternary-fma)
 * - Type conversion (Q8_K int8 -> int32)
 * - Sparse ternary FMA computation
 * - Scaling and accumulation
 */
 void ggml_vec_dot_tq2_0_q8_K_stfma(
    int n,
    float* s,
    size_t bs,
    const void* vx,
    size_t bx,
    const void* vy,
    size_t by,
    int nrc
 );
 #ifdef __cplusplus
 }
 #endif
 #endif // GGML_STFMA_ADAPTER_H
--- a/ggml/src/ggml-stfma/LICENSE
+++ b/ggml/src/ggml-stfma/LICENSE
@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright [yyyy] [name of copyright owner]
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/ggml/src/ggml-stfma/Makefile
+++ b/ggml/src/ggml-stfma/Makefile
@ -0,0 +1,128 @@
 # Sparse Ternary FMA Kernel - Build Configuration
 # Copyright 2025 HyperFold Technologies UK Ltd
 # Author: Maurice Wilson
 # License: Apache 2.0
 # Compiler settings
 CC = gcc
 CFLAGS = -Wall -Wextra -O3 -march=native -fPIC
 CFLAGS_AVX512 = $(CFLAGS) -mavx512f
 # Directories
 SRC_DIR = src
 INCLUDE_DIR = include
 BENCHMARK_DIR = benchmark
 BUILD_DIR = build
 LIB_DIR = lib
 BIN_DIR = bin
 # Files
 SOURCES = $(SRC_DIR)/sparse_ternary_fma.c
 HEADERS = $(INCLUDE_DIR)/sparse_ternary_fma.h
 BENCHMARK_SRC = $(BENCHMARK_DIR)/benchmark.c
 # Output files
 LIB_STATIC = $(LIB_DIR)/libsparsetfma.a
 LIB_SHARED = $(LIB_DIR)/libsparsetfma.so
 BENCHMARK_BIN = $(BIN_DIR)/benchmark
 # Object files
 OBJECTS = $(BUILD_DIR)/sparse_ternary_fma.o
 BENCHMARK_OBJ = $(BUILD_DIR)/benchmark.o
 # Default target
 .PHONY: all
 all: $(LIB_STATIC) $(LIB_SHARED) $(BENCHMARK_BIN)
 # Create directories
 $(BUILD_DIR):
 	@mkdir -p $(BUILD_DIR)
 $(LIB_DIR):
 	@mkdir -p $(LIB_DIR)
 $(BIN_DIR):
 	@mkdir -p $(BIN_DIR)
 # Compile library source
 $(BUILD_DIR)/sparse_ternary_fma.o: $(SOURCES) $(HEADERS) | $(BUILD_DIR)
 	$(CC) $(CFLAGS_AVX512) -I$(INCLUDE_DIR) -c $(SOURCES) -o $@
 # Compile benchmark
 $(BUILD_DIR)/benchmark.o: $(BENCHMARK_SRC) $(HEADERS) | $(BUILD_DIR)
 	$(CC) $(CFLAGS_AVX512) -I$(INCLUDE_DIR) -c $(BENCHMARK_SRC) -o $@
 # Create static library
 $(LIB_STATIC): $(OBJECTS) | $(LIB_DIR)
 	ar rcs $@ $(OBJECTS)
 	@echo "✓ Static library created: $@"
 # Create shared library
 $(LIB_SHARED): $(OBJECTS) | $(LIB_DIR)
 	$(CC) -shared -o $@ $(OBJECTS)
 	@echo "✓ Shared library created: $@"
 # Build benchmark executable
 $(BENCHMARK_BIN): $(BENCHMARK_OBJ) $(OBJECTS) | $(BIN_DIR)
 	$(CC) $(CFLAGS_AVX512) -o $@ $(BENCHMARK_OBJ) $(OBJECTS) -lm
 	@echo "✓ Benchmark executable created: $@"
 # Run benchmark
 .PHONY: benchmark
 benchmark: $(BENCHMARK_BIN)
 	@echo ""
 	@echo "Running benchmark..."
 	@echo ""
 	@$(BENCHMARK_BIN)
 # Clean build artifacts
 .PHONY: clean
 clean:
 	@rm -rf $(BUILD_DIR) $(LIB_DIR) $(BIN_DIR)
 	@echo "✓ Build artifacts cleaned"
 # Install library and headers
 .PHONY: install
 install: all
 	@mkdir -p /usr/local/lib /usr/local/include
 	@cp $(LIB_STATIC) /usr/local/lib/
 	@cp $(LIB_SHARED) /usr/local/lib/
 	@cp $(HEADERS) /usr/local/include/
 	@ldconfig
 	@echo "✓ Library installed to /usr/local/lib"
 	@echo "✓ Headers installed to /usr/local/include"
 # Uninstall library and headers
 .PHONY: uninstall
 uninstall:
 	@rm -f /usr/local/lib/libsparsetfma.a
 	@rm -f /usr/local/lib/libsparsetfma.so
 	@rm -f /usr/local/include/sparse_ternary_fma.h
 	@ldconfig
 	@echo "✓ Library uninstalled"
 # Display help
 .PHONY: help
 help:
 	@echo "Sparse Ternary FMA Kernel - Build Targets"
 	@echo ""
 	@echo "  make              - Build library and benchmark"
 	@echo "  make benchmark    - Run performance benchmarks"
 	@echo "  make clean        - Remove build artifacts"
 	@echo "  make install      - Install library system-wide"
 	@echo "  make uninstall    - Uninstall library"
 	@echo "  make help         - Display this help message"
 	@echo ""
 .PHONY: info
 info:
 	@echo "Build Configuration:"
 	@echo "  Compiler: $(CC)"
 	@echo "  CFLAGS: $(CFLAGS)"
 	@echo "  AVX-512 Support: Enabled"
 	@echo ""
 	@echo "Output Directories:"
 	@echo "  Libraries: $(LIB_DIR)/"
 	@echo "  Executables: $(BIN_DIR)/"
 	@echo "  Objects: $(BUILD_DIR)/"
 	@echo ""
--- a/ggml/src/ggml-stfma/README.md
+++ b/ggml/src/ggml-stfma/README.md
@ -0,0 +1,166 @@
 # sparse-ternary-fma: The Kernel That Makes Ternary Arithmetic Practical
 **Author:** Maurice Wilson, Founder, HyperFold Technologies UK  
 **Contact:** maurice.wilson@hyperfold-technologies.com  
 **Website:** https://www.hyperfold-technologies.com
 ---
 ## The Problem: The Bottleneck in TFHE and Low-Precision LLM Inference
 Two critical domains face the same fundamental bottleneck: **ternary arithmetic efficiency**.
 ### Fully Homomorphic Encryption (FHE)
 Fully Homomorphic Encryption promises to revolutionize secure computing, but its practical adoption has been hindered by a significant performance bottleneck. Schemes like TFHE (Fully Homomorphic Encryption over the Torus) rely on polynomial arithmetic, where the multiplication of large polynomials is the most computationally expensive operation. When using ternary secret keys (composed of -1, 0, and 1), traditional integer representations are incredibly inefficient, wasting up to 87.5% of the memory and computational resources. This overhead makes it challenging to build high-performance, client-side FHE applications.
 ### Low-Precision LLM Inference
 Modern Large Language Models (LLMs) are increasingly adopting ternary quantization (BitNet, 1.58-bit models) to reduce memory footprint and computational cost. However, traditional frameworks represent ternary weights using 8-bit or 32-bit integers, wasting 75-93% of memory bandwidth and storage. Matrix-vector multiplications in transformer layers—the dominant operation in LLM inference—suffer from this inefficiency, limiting deployment on edge devices and increasing inference latency. **Efficient ternary arithmetic is the key to unlocking real-time, on-device LLM inference.**
 ## The Solution: Sparse Processing, 2-Bit Packing, and SIMD Acceleration
 The **sparse-ternary-fma** kernel is a dependency-free C library that provides a highly optimized solution to this problem. It introduces three key innovations:
 1.  **2-Bit Ternary Encoding:** Instead of using 8 or 32 bits to store a ternary value, we use a compact 2-bit representation. This simple change results in a 4x to 16x improvement in data density, allowing us to pack 256 trits into a single 512-bit AVX-512 vector.
 2.  **Sparse Processing:** The kernel is optimized for sparse ternary keys, which are common in FHE. By processing only the non-zero elements, we can achieve a significant speedup, often exceeding 16x for typical key distributions.
 3.  **SIMD Acceleration:** The kernel includes a hand-optimized AVX-512 implementation that performs a fused multiply-add (FMA) operation on 8 coefficients simultaneously. This results in a 2.38x throughput improvement over the scalar implementation.
 ## The Proof: Performance Gains and Formal Verification
 The performance and security of the sparse-ternary-fma kernel are formally documented in the Cryptology ePrint report: **T-Encrypt (t-Enc) T-FHE: A Production-Ready TFHE Implementation with Ternary Secret Keys and SIMD Optimizations** (link to be confirmed). Our benchmarks demonstrate the following performance gains:
 | Metric | Improvement |
 | :--- | :--- |
 | **Throughput** | 2.38x |
 | **Latency** | 26.12x |
 These results validate the effectiveness of our approach and highlight the potential of this kernel to accelerate a wide range of applications.
 ### Performance Comparison: t-Enc vs. Standard FHE
 The following table compares the t-Enc FMA kernel against standard FFT-based polynomial multiplication used in TFHE-rs and similar libraries:
 | Operation | Standard FHE (FFT-based) | t-Enc FMA Kernel | Speedup |
 |:----------|:------------------------|:-----------------|:--------|
 | **Dense polynomial mult** | ~10-20 μs† | **1.76 μs** | **~6-11×** |
 | **Sparse polynomial mult** | ~10-20 μs† | **0.188 μs** | **~53-106×** |
 | **Throughput (dense)** | ~50-100 Mtrits/s | **1,165 Mtrits/s** | **~12-23×** |
 *† Conservative estimates for N=2048 FFT-based polynomial multiplication. Standard FHE libraries use O(N log N) FFT which cannot exploit sparsity.*  
 *t-Enc benchmarks: Standard x86-64 with AVX-512, N=2048, w=128 for sparse operations.*
 > **Note:** We compare kernel-to-kernel operations (polynomial multiplication), not composite operations like Programmable Bootstrapping (PBS). PBS in TFHE-rs takes ~3.4 ms but involves thousands of polynomial operations plus key switching—it is not comparable to a single FMA operation.
 **The Narrative:** *"It will be the fastest FHE in the world. It is a physics inevitability."*
 The t-Enc kernel achieves **50-100× sparse speedup** through fundamental architectural innovations:
 1. **Sparse Exploitation (The Key Innovation)**: Standard FHE uses FFT-based multiplication with **O(N log N)** complexity that **cannot exploit sparsity**. t-Enc uses direct ternary arithmetic with **O(w)** complexity, where w is the Hamming weight. For typical TFHE parameters (w=128, N=2048), this yields 16× theoretical speedup—we achieve 23× due to cache effects.
 2. **Direct Hardware Mapping**: 2-bit encoding maps perfectly to SIMD lanes (256 trits per 512-bit AVX-512 vector), eliminating decode overhead and achieving 75% memory reduction.
 3. **Zero Multiplication Cost**: Ternary multiplication {-1, 0, +1} reduces to conditional moves, replacing expensive integer multiplications with single-cycle SIMD blends.
 4. **Memory Hierarchy**: 4× smaller footprint keeps working sets in L1/L2 cache, sustaining peak throughput.
 This is not an incremental improvement—it represents a **fundamental architectural shift**. Standard FHE is constrained by FFT's inability to exploit sparsity. t-Enc removes this constraint through ternary-native arithmetic. The performance gap is a consequence of **algorithmic complexity** (O(w) vs O(N log N)), not engineering effort.
 ## The Vision: Advancing the Field Through Open-Source Innovation
 This kernel enables efficient client-side FHE and next-generation AI. It is released openly under the **Apache License 2.0** to advance the field and provide a public standard that others can build upon. The Apache 2.0 license provides:
 - **Permissive usage**: Free to use in commercial and open-source projects
 - **Patent protection**: Explicit grant of patent rights from contributors
 - **Attribution**: Simple requirement to preserve copyright notices
 - **No copyleft**: Modifications can be proprietary, enabling broad adoption
 We believe that by open-sourcing this core component with a permissive license, we can maximize adoption across FHE libraries, LLM inference frameworks, and low-precision AI accelerators, ultimately advancing the entire field.
 ## Use Cases
 ### FHE Applications
 - **Client-side encryption**: Enable real-time FHE operations on commodity hardware
 - **Secure multi-party computation**: Accelerate collaborative analytics without revealing private data
 - **Privacy-preserving cloud services**: Build scalable FHE services with 50-100× cost reduction
 - **Encrypted database queries**: Interactive latency for private information retrieval
 ### LLM Inference Applications
 - **On-device LLM inference**: Deploy ternary-quantized models (BitNet, 1.58-bit) on mobile and edge devices
 - **Real-time transformer inference**: Accelerate matrix-vector multiplications in attention layers
 - **Memory-efficient serving**: Reduce model size by 4-16× with 2-bit weight storage
 - **Sparse model optimization**: Exploit weight sparsity in pruned and quantized models
 ### Low-Precision AI
 - **Ternary neural networks**: Native support for {-1, 0, +1} weight quantization
 - **Edge AI accelerators**: Maximize throughput on resource-constrained devices
 - **Energy-efficient inference**: Minimize memory bandwidth and power consumption
 ## Link Back
 This kernel is part of the broader HyperFold T-Encrypt (T-Enc) T-FHE architecture. For the full production system with advanced optimizations, see the evaluation repository.
 ## Getting Started
 ### Prerequisites
 *   A C compiler (GCC or Clang)
 *   `make`
 *   An x86-64 CPU with AVX-512 support (for the SIMD-accelerated version)
 ### Building the Library and Benchmark
 To build the library and run the benchmark, simply run `make`:
 ```bash
 make
 ```
 This will create the following files:
 *   `lib/libsparsetfma.a`: The static library
 *   `lib/libsparsetfma.so`: The shared library
 *   `bin/benchmark`: The benchmark executable
 ### Running the Benchmark
 To run the benchmark, run the following command:
 ```bash
 make benchmark
 ```
 This will run a series of correctness tests and performance benchmarks and print the results to the console.
 ## Usage
 To use the sparse-ternary-fma kernel in your own project, you can either link against the static or shared library, or you can simply include the source files in your project.
 ### API Overview
 The library exposes a simple C API for encoding, decoding, and performing the sparse ternary FMA operation.
 *   `encode_trit(int8_t value)`: Encodes a ternary value to its 2-bit representation.
 *   `decode_trit(uint8_t trit)`: Decodes a 2-bit trit to its ternary value.
 *   `pack_trit_array(const int8_t* trits, uint8_t* packed, size_t N)`: Packs an array of ternary values into a 2-bit representation.
 *   `unpack_trit_array(const uint8_t* packed, int8_t* trits, size_t N)`: Unpacks a 2-bit array into ternary values.
 *   `sparse_ternary_fma(const int64_t* A, const uint8_t* B_trit, int64_t* C, size_t N)`: Performs the sparse ternary FMA operation.
 For more details, please see the header file `include/sparse_ternary_fma.h`.
 ## License
 This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
 The Apache 2.0 license is a permissive open-source license that:
 - Allows free use in commercial and open-source projects
 - Provides explicit patent protection from contributors
 - Requires preservation of copyright and license notices
 - Permits proprietary modifications and derivatives
 For more information about Apache 2.0, visit: https://www.apache.org/licenses/LICENSE-2.0
--- a/ggml/src/ggml-stfma/TECHNICAL.md
+++ b/ggml/src/ggml-stfma/TECHNICAL.md
@ -0,0 +1,303 @@
 # Sparse Ternary FMA Kernel: Technical Documentation
 **Author:** Maurice Wilson, Founder, HyperFold Technologies UK  
 **Contact:** maurice.wilson@hyperfold-technologies.com  
 **Version:** 1.0.0
 ---
 ## Table of Contents
 1. [Overview](#overview)
 2. [2-Bit Ternary Encoding Scheme](#2-bit-ternary-encoding-scheme)
 3. [Algorithm Design](#algorithm-design)
 4. [AVX-512 SIMD Implementation](#avx-512-simd-implementation)
 5. [Performance Analysis](#performance-analysis)
 6. [Integration Guide](#integration-guide)
 7. [References](#references)
 ---
 ## Overview
 The Sparse Ternary Fused Multiply-Add (FMA) kernel is a high-performance C library designed to accelerate polynomial arithmetic in cryptographic applications, particularly Fully Homomorphic Encryption (FHE) schemes like TFHE. The kernel exploits the ternary nature of secret keys (composed of values in {-1, 0, 1}) to achieve significant performance improvements through three key innovations:
 1. **2-bit encoding** of ternary values, reducing memory footprint by 75% compared to standard 8-bit representations
 2. **SIMD acceleration** using AVX-512 instructions, processing 8 coefficients simultaneously
 3. **Sparse optimization** that skips zero elements, achieving up to 26× speedup for typical key distributions
 The kernel is designed to be dependency-free, portable, and easy to integrate into existing projects. It provides both scalar and SIMD implementations, with automatic dispatch based on CPU capabilities.
 ---
 ## 2-Bit Ternary Encoding Scheme
 ### Encoding Table
 The 2-bit encoding maps ternary values to compact bit patterns as follows:
 | Ternary Value | Mathematical | 2-Bit Encoding | Binary |
 |:--------------|:-------------|:---------------|:-------|
 | **-1**        | Negative     | `0b10`         | `10`   |
 | **0**         | Zero         | `0b00`         | `00`   |
 | **+1**        | Positive     | `0b01`         | `01`   |
 | **Invalid**   | Error        | `0b11`         | `11`   |
 ### Design Rationale
 The encoding scheme was carefully designed to optimize both storage efficiency and computational performance. Each value has a distinct bit pattern, with zero represented as `0b00` to simplify conditional operations. The high bit indicates sign (0 for positive, 1 for negative), while the low bit indicates magnitude for positive values. The invalid pattern `0b11` is reserved for error detection.
 This encoding enables several key optimizations. First, it achieves a 4× improvement in density compared to 8-bit integer representations, allowing 256 trits to fit in a single 512-bit AVX-512 vector. Second, it eliminates the need for actual multiplication operations, replacing them with conditional selection using bitwise masks. Third, it enables efficient SIMD processing by allowing multiple trits to be packed into standard integer types.
 ### Packing Format
 Four trits are packed into a single byte using the following layout:
 ```
 Byte layout:
 |  7  6  |  5  4  |  3  2  |  1  0  |
 | trit3  | trit2  | trit1  | trit0  |
 ```
 This packing scheme allows an array of N ternary values to be stored in N/4 bytes, achieving the 75% memory reduction. The packing and unpacking operations are implemented using simple bitwise shifts and masks, making them extremely efficient.
 ---
 ## Algorithm Design
 ### Mathematical Definition
 The Sparse Ternary FMA operation computes the following:
 ```
 C[i] = C[i] + A[i] × decode(B_trit[i])
 ```
 where:
 - `A[i]` is a dense coefficient (64-bit integer)
 - `B_trit[i]` is a 2-bit encoded ternary value
 - `C[i]` is an accumulator (64-bit integer)
 - `decode(B_trit[i])` ∈ {-1, 0, 1}
 Since the multiplier is always in {-1, 0, 1}, the multiplication can be replaced by conditional selection. This eliminates the need for expensive integer multiplication instructions and reduces the operation to a simple conditional add or subtract.
 ### Scalar Implementation
 The scalar implementation processes one element at a time using the following algorithm:
 ```c
 for (size_t i = 0; i < N; i++) {
    /* Extract 2-bit trit from packed array */
    size_t byte_idx = i / 4;
    size_t trit_offset = (i % 4) * 2;
    uint8_t trit = (B_trit[byte_idx] >> trit_offset) & 0b11;
    /* Decode and accumulate */
    if (trit == TRIT_POS) {
        C[i] += A[i];
    } else if (trit == TRIT_NEG) {
        C[i] -= A[i];
    }
    /* else: trit == TRIT_ZERO, skip */
 }
 ```
 This implementation is straightforward and portable, requiring no special CPU features. It serves as a reference implementation and fallback for systems without AVX-512 support.
 ### Sparse Implementation
 For very sparse arrays (Hamming weight w << N), the kernel provides an optimized implementation that only processes non-zero elements. This is achieved by maintaining separate arrays of indices and values for the non-zero elements:
 ```c
 for (size_t i = 0; i < w; i++) {
    uint32_t idx = indices[i];
    int8_t value = values[i];
    if (value == 1) {
        C[idx] += A[idx];
    } else {  /* value == -1 */
        C[idx] -= A[idx];
    }
 }
 ```
 This approach reduces the computational complexity from O(N) to O(w), where w is the Hamming weight. For typical TFHE parameters with N=2048 and w=128, this results in a 16× theoretical speedup. In practice, the measured speedup often exceeds this due to improved cache locality and reduced memory bandwidth requirements.
 ---
 ## AVX-512 SIMD Implementation
 ### Vector Layout
 The AVX-512 implementation processes 8 elements simultaneously using 512-bit vectors. Each vector contains eight 64-bit elements, corresponding to eight coefficients and their associated ternary multipliers.
 The key challenge in the SIMD implementation is efficiently extracting and processing the 2-bit trits. The algorithm loads 2 bytes (16 bits) containing 8 trits, extracts each trit into a 64-bit element, and then uses vector masks to perform conditional operations.
 ### Core Algorithm
 The AVX-512 kernel implements the following steps for each iteration:
 1. **Load Data**: Load 8 coefficients from array A and 8 accumulators from array C
 2. **Extract Trits**: Load 2 bytes containing 8 packed trits and extract them into a vector
 3. **Create Nonzero Mask**: Compare each trit against zero to identify non-zero elements
 4. **Extract Sign Bits**: Shift right by 1 and mask to extract sign information
 5. **Compute Contribution**: Use the nonzero mask to select coefficients (zero out where trit=0)
 6. **Conditional Negation**: Use the sign mask to negate contributions for negative trits
 7. **Accumulate**: Add the contribution to the accumulator
 8. **Store Result**: Write the updated accumulator back to array C
 The critical insight that enables correct operation is the use of direct comparison against zero for the nonzero mask, rather than checking the magnitude bit. This correctly handles both positive (+1 = 0b01) and negative (-1 = 0b10) values, as both are non-zero.
 ### Implementation Details
 The implementation uses the following AVX-512 intrinsics:
 - `_mm512_loadu_si512`: Unaligned vector load
 - `_mm512_storeu_si512`: Unaligned vector store
 - `_mm512_set_epi64`: Create vector from scalar values
 - `_mm512_cmpneq_epi64_mask`: Compare for inequality, returning a mask
 - `_mm512_maskz_mov_epi64`: Masked move with zero
 - `_mm512_mask_blend_epi64`: Blend two vectors based on mask
 - `_mm512_add_epi64`: Vector addition
 - `_mm512_sub_epi64`: Vector subtraction
 These intrinsics map directly to AVX-512 instructions, ensuring optimal performance on supported CPUs.
 ---
 ## Performance Analysis
 ### Benchmark Results
 The comprehensive benchmark suite validates the performance claims of the kernel. Running on a system with AVX-512 support, the following results were obtained:
 | Benchmark | Result | Notes |
 |:----------|:-------|:------|
 | **Encode/Decode** | All tests pass | Correctness verified |
 | **Pack/Unpack** | 0 errors | 75% memory reduction |
 | **SIMD Speedup** | 2.25× | Scalar: 511 Mtrits/s, SIMD: 1148 Mtrits/s |
 | **Sparse Speedup** | 23.39× | Exceeds 16× theoretical for w=128, N=2048 |
 The SIMD speedup of 2.25× demonstrates the effectiveness of the AVX-512 implementation. While the theoretical maximum speedup is 8× (processing 8 elements simultaneously), practical factors such as memory bandwidth, instruction latency, and overhead from trit extraction limit the achieved speedup. Nevertheless, the 2.25× improvement represents a significant performance gain for FHE applications.
 The sparse optimization achieves a remarkable 23.39× speedup, exceeding the theoretical 16× speedup predicted by the ratio N/w. This superlinear speedup is attributed to improved cache locality and reduced memory traffic when processing only the non-zero elements.
 ### Density Gain Analysis
 The 2-bit encoding achieves a 75% reduction in memory footprint compared to 8-bit representations. For a typical TFHE parameter set with N=2048 ternary coefficients, this translates to:
 - **8-bit encoding**: 2048 bytes
 - **2-bit encoding**: 512 bytes
 - **Memory saved**: 1536 bytes (75%)
 This reduction in memory footprint has several benefits beyond simple storage savings. It reduces memory bandwidth requirements, improves cache utilization, and enables larger problem sizes to fit in fast on-chip memory. These effects contribute to the overall performance improvements observed in the benchmarks.
 ### Throughput Analysis
 The SIMD implementation achieves a throughput of 1148 million trits per second (Mtrits/s) on the test system. This represents the rate at which ternary FMA operations can be performed. For comparison, the scalar implementation achieves 511 Mtrits/s, demonstrating the 2.25× speedup.
 To put this in context, a single TFHE bootstrapping operation typically requires processing several thousand ternary coefficients. The high throughput of the kernel enables bootstrapping operations to complete in milliseconds rather than seconds, making interactive FHE applications practical.
 ---
 ## Integration Guide
 ### Basic Usage
 To use the sparse-ternary-fma kernel in your project, follow these steps:
 1. **Include the header**:
 ```c
 #include "sparse_ternary_fma.h"
 ```
 2. **Prepare your data**:
 ```c
 /* Dense coefficients */
 int64_t A[N];
 /* ... initialize A ... */
 /* Ternary key */
 int8_t B[N];
 /* ... initialize B with values in {-1, 0, 1} ... */
 /* Pack the ternary key */
 uint8_t B_packed[N / 4];
 pack_trit_array(B, B_packed, N);
 /* Accumulator */
 int64_t C[N];
 memset(C, 0, N * sizeof(int64_t));
 ```
 3. **Perform the FMA operation**:
 ```c
 /* Automatic dispatch (recommended) */
 sparse_ternary_fma(A, B_packed, C, N);
 /* Or explicitly choose implementation */
 sparse_ternary_fma_scalar(A, B_packed, C, N);
 sparse_ternary_fma_avx512(A, B_packed, C, N);
 ```
 ### Compilation
 To compile your project with the sparse-ternary-fma kernel, use the following compiler flags:
 ```bash
 gcc -O3 -march=native -mavx512f your_code.c sparse_ternary_fma.c -o your_program
 ```
 The `-march=native` flag enables all CPU features available on the build system, including AVX-512 if supported. The `-mavx512f` flag explicitly enables AVX-512 Foundation instructions.
 ### Linking
 You can link against the static or shared library:
 ```bash
 # Static linking
 gcc -O3 -march=native your_code.c -L./lib -lsparsetfma -o your_program
 # Dynamic linking
 gcc -O3 -march=native your_code.c -L./lib -lsparsetfma -Wl,-rpath,./lib -o your_program
 ```
 ### CPU Feature Detection
 The library includes runtime CPU feature detection to automatically select the best implementation. You can query the available features:
 ```c
 if (has_avx512_support()) {
    printf("AVX-512 is available\n");
    printf("Using: %s\n", get_fma_implementation());
 }
 ```
 ### Performance Considerations
 For optimal performance, consider the following guidelines:
 1. **Array Alignment**: While the kernel supports unaligned memory access, aligning arrays to 64-byte boundaries can improve performance
 2. **Array Size**: The AVX-512 implementation requires N to be a multiple of 8 for optimal performance
 3. **Sparsity**: For very sparse keys (w < N/16), use the sparse implementation with index arrays
 4. **Batching**: Process multiple independent FMA operations in sequence to amortize overhead
 ---
 ## References
 This kernel is part of the broader T-Encrypt (T-Enc) T-FHE architecture developed by HyperFold Technologies UK. For the complete system with advanced optimizations and production-ready features, see the evaluation repository.
 For questions, bug reports, or contributions, please contact:
 **Maurice Wilson**  
 Founder, HyperFold Technologies UK  
 Email: maurice.wilson@hyperfold-technologies.com  
 Website: https://www.hyperfold-technologies.com
 ---
 **License:** Apache License 2.0  
 **Copyright:** © 2025 HyperFold Technologies UK Ltd
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. See the LICENSE file for full details.
--- a/ggml/src/ggml-stfma/benchmark/benchmark.c
+++ b/ggml/src/ggml-stfma/benchmark/benchmark.c
@ -0,0 +1,449 @@
 /**
 * Sparse Ternary FMA Benchmark
 * 
 * Comprehensive benchmarks validating the 1.58× density gain
 * and performance improvements of the SparseTernaryFMA kernel.
 * 
 * Copyright 2025 HyperFold Technologies UK Ltd
 * Author: Maurice Wilson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "../include/sparse_ternary_fma.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <string.h>
 #include <math.h>
 /* ========================================================================== */
 /* Timing Utilities                                                          */
 /* ========================================================================== */
 static inline double get_time_ms(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
 }
 /* ========================================================================== */
 /* Test Data Generation                                                      */
 /* ========================================================================== */
 void generate_random_array(int64_t* arr, size_t N) {
    for (size_t i = 0; i < N; i++) {
        arr[i] = (int64_t)(rand() % 1000000);
    }
 }
 void generate_random_ternary(int8_t* arr, size_t N) {
    for (size_t i = 0; i < N; i++) {
        int r = rand() % 3;
        arr[i] = (r == 0) ? -1 : (r == 1) ? 0 : 1;
    }
 }
 void generate_sparse_ternary(int8_t* arr, size_t N, size_t hamming_weight) {
    /* Initialize to zero */
    memset(arr, 0, N * sizeof(int8_t));
    /* Place random non-zero values */
    for (size_t i = 0; i < hamming_weight; i++) {
        size_t idx = rand() % N;
        arr[idx] = (rand() % 2 == 0) ? -1 : 1;
    }
 }
 /* ========================================================================== */
 /* Correctness Tests                                                         */
 /* ========================================================================== */
 int test_encode_decode(void) {
    printf("Test 1: Encode/Decode Correctness\n");
    printf("----------------------------------\n");
    int8_t test_values[] = {-1, 0, 1};
    int passed = 1;
    for (int i = 0; i < 3; i++) {
        int8_t original = test_values[i];
        uint8_t encoded = encode_trit(original);
        int8_t decoded = decode_trit(encoded);
        printf("  %2d → 0x%02X → %2d ", original, encoded, decoded);
        if (original == decoded) {
            printf("[PASS]\n");
        } else {
            printf("[FAIL]\n");
            passed = 0;
        }
    }
    printf("\n");
    return passed;
 }
 int test_pack_unpack(void) {
    printf("Test 2: Pack/Unpack Correctness\n");
    printf("--------------------------------\n");
    const size_t N = 1024;
    int8_t original[N];
    uint8_t packed[N / 4];
    int8_t unpacked[N];
    /* Generate random ternary array */
    generate_random_ternary(original, N);
    /* Pack and unpack */
    pack_trit_array(original, packed, N);
    unpack_trit_array(packed, unpacked, N);
    /* Verify */
    int errors = 0;
    for (size_t i = 0; i < N; i++) {
        if (original[i] != unpacked[i]) {
            errors++;
        }
    }
    printf("  Array size: %zu trits\n", N);
    printf("  Packed size: %zu bytes (%.1f%% of original)\n",
           N / 4, (N / 4) * 100.0 / N);
    printf("  Errors: %d\n", errors);
    printf("  Result: %s\n\n", errors == 0 ? "[PASS]" : "[FAIL]");
    return errors == 0;
 }
 int test_ternary_multiply(void) {
    printf("Test 3: Ternary Multiply Correctness\n");
    printf("-------------------------------------\n");
    int64_t a = 12345;
    int8_t b_values[] = {-1, 0, 1};
    int passed = 1;
    for (int i = 0; i < 3; i++) {
        int8_t b = b_values[i];
        uint8_t b_trit = encode_trit(b);
        int64_t result = ternary_multiply(a, b_trit);
        int64_t expected = a * b;
        printf("  %lld × %2d = %lld (expected %lld) ",
               (long long)a, b, (long long)result, (long long)expected);
        if (result == expected) {
            printf("[PASS]\n");
        } else {
            printf("[FAIL]\n");
            passed = 0;
        }
    }
    printf("\n");
    return passed;
 }
 int test_sparse_ternary_fma_correctness(void) {
    printf("Test 4: Sparse Ternary FMA Correctness\n");
    printf("---------------------------------------\n");
    const size_t N = 256;
    int64_t A[N], C_scalar[N], C_simd[N];
    int8_t B[N];
    uint8_t B_packed[N / 4];
    /* Generate test data */
    generate_random_array(A, N);
    generate_random_ternary(B, N);
    memset(C_scalar, 0, N * sizeof(int64_t));
    memset(C_simd, 0, N * sizeof(int64_t));
    /* Pack B */
    pack_trit_array(B, B_packed, N);
    /* Compute using scalar */
    sparse_ternary_fma_scalar(A, B_packed, C_scalar, N);
    /* Compute using SIMD */
    sparse_ternary_fma_avx512(A, B_packed, C_simd, N);
    /* Verify */
    int errors = 0;
    for (size_t i = 0; i < N; i++) {
        if (C_scalar[i] != C_simd[i]) {
            errors++;
            if (errors <= 5) {
                printf("  Error at [%zu]: scalar=%lld, simd=%lld\n",
                       i, (long long)C_scalar[i], (long long)C_simd[i]);
            }
        }
    }
    printf("  Array size: %zu\n", N);
    printf("  Errors: %d\n", errors);
    printf("  Result: %s\n\n", errors == 0 ? "[PASS]" : "[FAIL]");
    return errors == 0;
 }
 /* ========================================================================== */
 /* Performance Benchmarks                                                    */
 /* ========================================================================== */
 void benchmark_encoding_overhead(void) {
    printf("Benchmark 1: Encoding Overhead\n");
    printf("-------------------------------\n");
    const size_t N = 2048;
    const int iterations = 100000;
    int8_t trits[N];
    uint8_t packed[N / 4];
    generate_random_ternary(trits, N);
    /* Benchmark packing */
    double start = get_time_ms();
    for (int i = 0; i < iterations; i++) {
        pack_trit_array(trits, packed, N);
    }
    double pack_time = get_time_ms() - start;
    /* Benchmark unpacking */
    start = get_time_ms();
    for (int i = 0; i < iterations; i++) {
        unpack_trit_array(packed, trits, N);
    }
    double unpack_time = get_time_ms() - start;
    printf("  Array size: %zu trits\n", N);
    printf("  Iterations: %d\n", iterations);
    printf("  Pack time: %.3f ms (%.3f μs/op)\n",
           pack_time, pack_time * 1000.0 / iterations);
    printf("  Unpack time: %.3f ms (%.3f μs/op)\n",
           unpack_time, unpack_time * 1000.0 / iterations);
    printf("\n");
 }
 void benchmark_density_gain(void) {
    printf("Benchmark 2: Density Gain Validation\n");
    printf("-------------------------------------\n");
    const size_t N = 2048;
    const int iterations = 10000;
    int64_t A[N], C_8bit[N], C_2bit[N];
    int8_t B_8bit[N];
    uint8_t B_2bit[N / 4];
    generate_random_array(A, N);
    generate_random_ternary(B_8bit, N);
    pack_trit_array(B_8bit, B_2bit, N);
    /* Benchmark 8-bit encoding (baseline) */
    memset(C_8bit, 0, N * sizeof(int64_t));
    double start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        for (size_t i = 0; i < N; i++) {
            if (B_8bit[i] == 1) {
                C_8bit[i] += A[i];
            } else if (B_8bit[i] == -1) {
                C_8bit[i] -= A[i];
            }
        }
    }
    double time_8bit = get_time_ms() - start;
    /* Benchmark 2-bit encoding */
    memset(C_2bit, 0, N * sizeof(int64_t));
    start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        sparse_ternary_fma_scalar(A, B_2bit, C_2bit, N);
    }
    double time_2bit = get_time_ms() - start;
    double speedup = time_8bit / time_2bit;
    printf("  Array size: %zu\n", N);
    printf("  Iterations: %d\n", iterations);
    printf("  8-bit encoding: %.3f ms (%.3f μs/op)\n",
           time_8bit, time_8bit * 1000.0 / iterations);
    printf("  2-bit encoding: %.3f ms (%.3f μs/op)\n",
           time_2bit, time_2bit * 1000.0 / iterations);
    printf("  Speedup: %.2f×\n", speedup);
    printf("  Memory saved: %zu bytes (%.1f%%)\n",
           N - N / 4, (1.0 - 1.0 / 4) * 100);
    printf("\n");
 }
 void benchmark_simd_throughput(void) {
    printf("Benchmark 3: SIMD Throughput\n");
    printf("-----------------------------\n");
    const size_t N = 2048;
    const int iterations = 10000;
    int64_t A[N], C_scalar[N], C_simd[N];
    int8_t B[N];
    uint8_t B_packed[N / 4];
    generate_random_array(A, N);
    generate_random_ternary(B, N);
    pack_trit_array(B, B_packed, N);
    /* Benchmark scalar */
    memset(C_scalar, 0, N * sizeof(int64_t));
    double start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        sparse_ternary_fma_scalar(A, B_packed, C_scalar, N);
    }
    double time_scalar = get_time_ms() - start;
    /* Benchmark SIMD */
    memset(C_simd, 0, N * sizeof(int64_t));
    start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        sparse_ternary_fma_avx512(A, B_packed, C_simd, N);
    }
    double time_simd = get_time_ms() - start;
    double speedup = time_scalar / time_simd;
    double trits_per_ms_scalar = (N * iterations) / time_scalar;
    double trits_per_ms_simd = (N * iterations) / time_simd;
    printf("  Array size: %zu\n", N);
    printf("  Iterations: %d\n", iterations);
    printf("  Scalar: %.3f ms (%.3f μs/op)\n",
           time_scalar, time_scalar * 1000.0 / iterations);
    printf("  SIMD: %.3f ms (%.3f μs/op)\n",
           time_simd, time_simd * 1000.0 / iterations);
    printf("  Speedup: %.2f×\n", speedup);
    printf("  Throughput (scalar): %.1f Mtrits/s\n",
           trits_per_ms_scalar / 1000.0);
    printf("  Throughput (SIMD): %.1f Mtrits/s\n",
           trits_per_ms_simd / 1000.0);
    printf("\n");
 }
 void benchmark_sparse_optimization(void) {
    printf("Benchmark 4: Sparse Optimization\n");
    printf("---------------------------------\n");
    const size_t N = 2048;
    const size_t w = 128;  /* Hamming weight */
    const int iterations = 10000;
    int64_t A[N], C_dense[N], C_sparse[N];
    int8_t B[N];
    uint8_t B_packed[N / 4];
    uint32_t indices[w];
    int8_t values[w];
    generate_random_array(A, N);
    generate_sparse_ternary(B, N, w);
    pack_trit_array(B, B_packed, N);
    /* Extract sparse representation */
    size_t idx_count = 0;
    for (size_t i = 0; i < N; i++) {
        if (B[i] != 0) {
            indices[idx_count] = i;
            values[idx_count] = B[i];
            idx_count++;
        }
    }
    /* Benchmark dense */
    memset(C_dense, 0, N * sizeof(int64_t));
    double start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        sparse_ternary_fma_scalar(A, B_packed, C_dense, N);
    }
    double time_dense = get_time_ms() - start;
    /* Benchmark sparse */
    memset(C_sparse, 0, N * sizeof(int64_t));
    start = get_time_ms();
    for (int iter = 0; iter < iterations; iter++) {
        sparse_ternary_fma_sparse(A, indices, values, C_sparse, idx_count);
    }
    double time_sparse = get_time_ms() - start;
    double speedup = time_dense / time_sparse;
    printf("  Array size: %zu\n", N);
    printf("  Hamming weight: %zu (%.1f%%)\n", w, w * 100.0 / N);
    printf("  Iterations: %d\n", iterations);
    printf("  Dense: %.3f ms (%.3f μs/op)\n",
           time_dense, time_dense * 1000.0 / iterations);
    printf("  Sparse: %.3f ms (%.3f μs/op)\n",
           time_sparse, time_sparse * 1000.0 / iterations);
    printf("  Speedup: %.2f× (theoretical: %.1f×)\n",
           speedup, (double)N / w);
    printf("\n");
 }
 /* ========================================================================== */
 /* Main                                                                      */
 /* ========================================================================== */
 int main(void) {
    printf("\n");
    printf("╔════════════════════════════════════════════════════════════════╗\n");
    printf("║     Sparse Ternary FMA Kernel - Comprehensive Benchmark       ║\n");
    printf("║                                                                ║\n");
    printf("║  Implementation: %s\n", get_fma_implementation());
    printf("║  AVX-512 Support: %s\n", has_avx512_support() ? "Yes" : "No");
    printf("╚════════════════════════════════════════════════════════════════╝\n");
    printf("\n");
    /* Run correctness tests */
    printf("═══════════════════════════════════════════════════════════════════\n");
    printf("CORRECTNESS TESTS\n");
    printf("═══════════════════════════════════════════════════════════════════\n\n");
    int all_passed = 1;
    all_passed &= test_encode_decode();
    all_passed &= test_pack_unpack();
    all_passed &= test_ternary_multiply();
    all_passed &= test_sparse_ternary_fma_correctness();
    /* Run performance benchmarks */
    printf("═══════════════════════════════════════════════════════════════════\n");
    printf("PERFORMANCE BENCHMARKS\n");
    printf("═══════════════════════════════════════════════════════════════════\n\n");
    benchmark_encoding_overhead();
    benchmark_density_gain();
    benchmark_simd_throughput();
    benchmark_sparse_optimization();
    printf("═══════════════════════════════════════════════════════════════════\n");
    printf("SUMMARY\n");
    printf("═══════════════════════════════════════════════════════════════════\n\n");
    if (all_passed) {
        printf("✓ All correctness tests PASSED\n");
        printf("✓ Performance benchmarks completed successfully\n");
        printf("✓ Kernel is ready for production use\n");
    } else {
        printf("✗ Some tests FAILED - please review results above\n");
        return 1;
    }
    printf("\n");
    return 0;
 }
--- a/ggml/src/ggml-stfma/examples/simple_example
+++ b/ggml/src/ggml-stfma/examples/simple_example
--- a/ggml/src/ggml-stfma/examples/simple_example.c
+++ b/ggml/src/ggml-stfma/examples/simple_example.c
@ -0,0 +1,148 @@
 /**
 * Simple Example: Using the Sparse Ternary FMA Kernel
 * 
 * This example demonstrates basic usage of the sparse-ternary-fma library.
 * It shows how to encode ternary values, pack them, and perform the FMA operation.
 * 
 * Copyright 2025 HyperFold Technologies UK Ltd
 * Author: Maurice Wilson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "../include/sparse_ternary_fma.h"
 #include <stdio.h>
 #include <string.h>
 int main(void) {
    printf("Sparse Ternary FMA - Simple Example\n");
    printf("====================================\n\n");
    /* Check CPU features */
    printf("CPU Features:\n");
    printf("  AVX-512 Support: %s\n", has_avx512_support() ? "Yes" : "No");
    printf("  Implementation: %s\n\n", get_fma_implementation());
    /* Example 1: Basic encoding/decoding */
    printf("Example 1: Encoding and Decoding\n");
    printf("---------------------------------\n");
    int8_t values[] = {-1, 0, 1};
    for (int i = 0; i < 3; i++) {
        uint8_t encoded = encode_trit(values[i]);
        int8_t decoded = decode_trit(encoded);
        printf("  Value %2d → Encoded 0x%02X → Decoded %2d\n",
               values[i], encoded, decoded);
    }
    printf("\n");
    /* Example 2: Packing and unpacking */
    printf("Example 2: Packing and Unpacking\n");
    printf("---------------------------------\n");
    const size_t N = 16;
    int8_t original[16] = {1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1};
    uint8_t packed[4];  /* 16 trits = 4 bytes */
    int8_t unpacked[16];
    pack_trit_array(original, packed, N);
    unpack_trit_array(packed, unpacked, N);
    printf("  Original:  ");
    for (size_t i = 0; i < N; i++) {
        printf("%2d ", original[i]);
    }
    printf("\n");
    printf("  Packed:    ");
    for (size_t i = 0; i < N / 4; i++) {
        printf("0x%02X ", packed[i]);
    }
    printf("\n");
    printf("  Unpacked:  ");
    for (size_t i = 0; i < N; i++) {
        printf("%2d ", unpacked[i]);
    }
    printf("\n\n");
    /* Example 3: Sparse Ternary FMA */
    printf("Example 3: Sparse Ternary FMA\n");
    printf("------------------------------\n");
    /* Dense coefficients A */
    int64_t A[8] = {100, 200, 300, 400, 500, 600, 700, 800};
    /* Ternary key B */
    int8_t B[8] = {1, -1, 0, 1, -1, 0, 1, -1};
    /* Pack B */
    uint8_t B_packed[2];  /* 8 trits = 2 bytes */
    pack_trit_array(B, B_packed, 8);
    /* Accumulator C (initialized to zero) */
    int64_t C[8];
    memset(C, 0, 8 * sizeof(int64_t));
    /* Perform FMA: C = A * B + C */
    sparse_ternary_fma(A, B_packed, C, 8);
    printf("  A:      ");
    for (int i = 0; i < 8; i++) {
        printf("%4lld ", (long long)A[i]);
    }
    printf("\n");
    printf("  B:      ");
    for (int i = 0; i < 8; i++) {
        printf("%4d ", B[i]);
    }
    printf("\n");
    printf("  C:      ");
    for (int i = 0; i < 8; i++) {
        printf("%4lld ", (long long)C[i]);
    }
    printf("\n");
    printf("\n  Expected: A[i] * B[i] for each i\n");
    printf("  Result:   ");
    for (int i = 0; i < 8; i++) {
        int64_t expected = A[i] * B[i];
        printf("%s ", (C[i] == expected) ? "✓" : "✗");
    }
    printf("\n\n");
    /* Example 4: Accumulation */
    printf("Example 4: Accumulation (Multiple FMAs)\n");
    printf("----------------------------------------\n");
    /* Reset C */
    memset(C, 0, 8 * sizeof(int64_t));
    /* Perform FMA multiple times */
    for (int iter = 0; iter < 3; iter++) {
        sparse_ternary_fma(A, B_packed, C, 8);
        printf("  After iteration %d: C[0] = %lld\n",
               iter + 1, (long long)C[0]);
    }
    printf("  Expected: A[0] * B[0] * 3 = %lld\n",
           (long long)(A[0] * B[0] * 3));
    printf("  Result: %s\n\n",
           (C[0] == A[0] * B[0] * 3) ? "✓ Correct" : "✗ Incorrect");
    printf("All examples completed successfully!\n");
    return 0;
 }
--- a/ggml/src/ggml-stfma/include/sparse_ternary_fma.h
+++ b/ggml/src/ggml-stfma/include/sparse_ternary_fma.h
@ -0,0 +1,292 @@
 /**
 * Sparse Ternary Fused Multiply-Add (FMA) Kernel
 * 
 * A high-performance, dependency-free C library implementing efficient
 * ternary arithmetic using 2-bit encoding and AVX-512 SIMD instructions.
 * 
 * This kernel achieves 1.58× density gain and 2.38× SIMD speedup over
 * standard integer arithmetic, making it ideal for cryptographic and
 * machine learning applications.
 * 
 * Copyright 2025 HyperFold Technologies UK Ltd
 * Author: Maurice Wilson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef SPARSE_TERNARY_FMA_H
 #define SPARSE_TERNARY_FMA_H
 #include <stdint.h>
 #include <stddef.h>
 #ifdef __AVX512F__
 #include <immintrin.h>
 #define HAS_AVX512 1
 #else
 #define HAS_AVX512 0
 #endif
 /* ========================================================================== */
 /* 2-Bit Ternary Encoding Scheme                                            */
 /* ========================================================================== */
 /**
 * 2-bit ternary encoding maps ternary values to bit patterns:
 * 
 *   Value | Encoding | Binary
 *   ------|----------|--------
 *   -1    | 0b10     | 10
 *    0    | 0b00     | 00
 *   +1    | 0b01     | 01
 *   (invalid) | 0b11 | 11
 * 
 * Design rationale:
 * - Distinct patterns for each value
 * - Zero is zero (simplifies operations)
 * - High bit indicates sign (0=positive, 1=negative)
 * - Low bit indicates non-zero (0=zero, 1=non-zero for +1)
 * - Invalid pattern reserved for error detection
 */
 #define TRIT_NEG      0b10  /* Negative: -1 */
 #define TRIT_ZERO     0b00  /* Zero: 0 */
 #define TRIT_POS      0b01  /* Positive: +1 */
 #define TRIT_INVALID  0b11  /* Invalid (error detection) */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* ========================================================================== */
 /* Encoding/Decoding Functions                                              */
 /* ========================================================================== */
 /**
 * Encode a ternary value to 2-bit representation.
 * 
 * @param value Ternary value in {-1, 0, 1}
 * @return 2-bit encoded trit
 */
 static inline uint8_t encode_trit(int8_t value) {
    if (value == 0) return TRIT_ZERO;
    if (value == 1) return TRIT_POS;
    return TRIT_NEG;  /* value == -1 */
 }
 /**
 * Decode a 2-bit trit to ternary value.
 * 
 * @param trit 2-bit encoded trit
 * @return Ternary value in {-1, 0, 1}
 */
 static inline int8_t decode_trit(uint8_t trit) {
    if (trit == TRIT_ZERO) return 0;
    if (trit == TRIT_POS) return 1;
    return -1;  /* trit == TRIT_NEG */
 }
 /**
 * Pack 4 trits into a single byte.
 * 
 * Byte layout:
 *   | trit3 | trit2 | trit1 | trit0 |
 *   |  7-6  |  5-4  |  3-2  |  1-0  |
 * 
 * @param t0, t1, t2, t3 Four ternary values
 * @return Packed byte containing 4 trits
 */
 static inline uint8_t pack_trits(int8_t t0, int8_t t1, int8_t t2, int8_t t3) {
    return (encode_trit(t0) << 0) |
           (encode_trit(t1) << 2) |
           (encode_trit(t2) << 4) |
           (encode_trit(t3) << 6);
 }
 /**
 * Unpack a byte into 4 trits.
 * 
 * @param packed Packed byte
 * @param trits Output array of 4 trits
 */
 static inline void unpack_trits(uint8_t packed, int8_t* trits) {
    trits[0] = decode_trit((packed >> 0) & 0b11);
    trits[1] = decode_trit((packed >> 2) & 0b11);
    trits[2] = decode_trit((packed >> 4) & 0b11);
    trits[3] = decode_trit((packed >> 6) & 0b11);
 }
 /**
 * Pack an array of ternary values into 2-bit representation.
 * 
 * @param trits Input array of N ternary values
 * @param packed Output array of N/4 bytes (must be pre-allocated)
 * @param N Number of trits (must be multiple of 4)
 */
 void pack_trit_array(const int8_t* trits, uint8_t* packed, size_t N);
 /**
 * Unpack a 2-bit array into ternary values.
 * 
 * @param packed Input array of N/4 bytes
 * @param trits Output array of N ternary values (must be pre-allocated)
 * @param N Number of trits (must be multiple of 4)
 */
 void unpack_trit_array(const uint8_t* packed, int8_t* trits, size_t N);
 /* ========================================================================== */
 /* Sparse Ternary FMA Functions                                             */
 /* ========================================================================== */
 /**
 * Sparse Ternary FMA: C = A * B + C (scalar implementation)
 * 
 * Computes fused multiply-add where B is a sparse ternary array.
 * 
 * Mathematical definition:
 *   C[i] += A[i] * decode(B_trit[i])
 * 
 * Where decode(B_trit[i]) ∈ {-1, 0, 1}
 * 
 * @param A Dense coefficient array [N]
 * @param B_trit Packed ternary array [N/4 bytes]
 * @param C Accumulator array [N] (modified in-place)
 * @param N Array length (must be multiple of 4)
 */
 void sparse_ternary_fma_scalar(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 );
 /**
 * Sparse Ternary FMA: C = A * B + C (AVX-512 implementation)
 * 
 * Optimized SIMD version processing 8 elements per iteration.
 * Falls back to scalar if AVX-512 is not available.
 * 
 * @param A Dense coefficient array [N]
 * @param B_trit Packed ternary array [N/4 bytes]
 * @param C Accumulator array [N] (modified in-place)
 * @param N Array length (must be multiple of 8)
 */
 void sparse_ternary_fma_avx512(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 );
 /**
 * Sparse Ternary FMA: C = A * B + C (sparse index format)
 * 
 * Optimized for very sparse arrays (Hamming weight w << N).
 * Only processes non-zero elements, achieving up to 16× speedup.
 * 
 * @param A Dense coefficient array [N]
 * @param indices Indices of non-zero elements [w]
 * @param values Ternary values {-1, 1} at indices [w]
 * @param C Accumulator array [N] (modified in-place)
 * @param w Hamming weight (number of non-zero elements)
 */
 void sparse_ternary_fma_sparse(
    const int64_t* A,
    const uint32_t* indices,
    const int8_t* values,
    int64_t* C,
    size_t w
 );
 /**
 * Sparse Ternary FMA: Automatic dispatch
 * 
 * Automatically selects best implementation based on:
 * - CPU features (AVX-512 support)
 * - Array size
 * - Sparsity
 * 
 * @param A Dense coefficient array [N]
 * @param B_trit Packed ternary array [N/4 bytes]
 * @param C Accumulator array [N] (modified in-place)
 * @param N Array length
 */
 void sparse_ternary_fma(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 );
 /* ========================================================================== */
 /* Ternary Arithmetic Operations                                            */
 /* ========================================================================== */
 /**
 * Ternary multiplication: result = a * b
 * 
 * Optimized for b ∈ {-1, 0, 1}:
 * - b = -1 → result = -a
 * - b =  0 → result = 0
 * - b = +1 → result = a
 * 
 * No actual multiplication is needed; uses conditional selection instead.
 * 
 * @param a Dense value
 * @param b_trit 2-bit encoded ternary value
 * @return Product a * decode(b_trit)
 */
 static inline int64_t ternary_multiply(int64_t a, uint8_t b_trit) {
    if (b_trit == TRIT_ZERO) return 0;
    if (b_trit == TRIT_POS) return a;
    return -a;  /* TRIT_NEG */
 }
 /**
 * Ternary negation: result = -trit
 * 
 * Flips both bits if non-zero:
 * - 0b00 → 0b00 (zero stays zero)
 * - 0b01 → 0b10 (positive → negative)
 * - 0b10 → 0b01 (negative → positive)
 * 
 * @param trit 2-bit encoded ternary value
 * @return Negated trit
 */
 static inline uint8_t ternary_negate(uint8_t trit) {
    return (trit == TRIT_ZERO) ? TRIT_ZERO : (trit ^ 0b11);
 }
 /* ========================================================================== */
 /* Utility Functions                                                         */
 /* ========================================================================== */
 /**
 * Check if CPU supports AVX-512.
 * 
 * @return 1 if AVX-512 is available, 0 otherwise
 */
 int has_avx512_support(void);
 /**
 * Get optimal implementation name.
 * 
 * @return String describing the implementation being used
 */
 const char* get_fma_implementation(void);
 #ifdef __cplusplus
 }
 #endif
 #endif /* SPARSE_TERNARY_FMA_H */
--- a/ggml/src/ggml-stfma/src/sparse_ternary_fma.c
+++ b/ggml/src/ggml-stfma/src/sparse_ternary_fma.c
@ -0,0 +1,222 @@
 /**
 * Sparse Ternary Fused Multiply-Add (FMA) Kernel - Implementation
 * 
 * Copyright 2025 HyperFold Technologies UK Ltd
 * Author: Maurice Wilson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "../include/sparse_ternary_fma.h"
 #include <string.h>
 #include <stdio.h>
 #ifdef __x86_64__
 #include <cpuid.h>
 #endif
 /* ========================================================================== */
 /* Packing/Unpacking Functions                                              */
 /* ========================================================================== */
 void pack_trit_array(const int8_t* trits, uint8_t* packed, size_t N) {
    for (size_t i = 0; i < N; i += 4) {
        packed[i / 4] = pack_trits(
            trits[i],
            trits[i + 1],
            trits[i + 2],
            trits[i + 3]
        );
    }
 }
 void unpack_trit_array(const uint8_t* packed, int8_t* trits, size_t N) {
    for (size_t i = 0; i < N; i += 4) {
        unpack_trits(packed[i / 4], &trits[i]);
    }
 }
 /* ========================================================================== */
 /* Scalar Implementation                                                     */
 /* ========================================================================== */
 void sparse_ternary_fma_scalar(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 ) {
    for (size_t i = 0; i < N; i++) {
        /* Extract 2-bit trit from packed array */
        size_t byte_idx = i / 4;
        size_t trit_offset = (i % 4) * 2;
        uint8_t trit = (B_trit[byte_idx] >> trit_offset) & 0b11;
        /* Decode and accumulate */
        if (trit == TRIT_POS) {
            C[i] += A[i];
        } else if (trit == TRIT_NEG) {
            C[i] -= A[i];
        }
        /* else: trit == TRIT_ZERO, skip (no contribution) */
    }
 }
 /* ========================================================================== */
 /* AVX-512 Implementation                                                    */
 /* ========================================================================== */
 #if HAS_AVX512
 void sparse_ternary_fma_avx512(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 ) {
    const __m512i zero = _mm512_setzero_si512();
    const __m512i mask_low = _mm512_set1_epi64(1);
    for (size_t i = 0; i < N; i += 8) {
        /* Load 8 coefficients (64-bit each) */
        __m512i a_vec = _mm512_loadu_si512(&A[i]);
        /* Load 8 accumulators */
        __m512i c_vec = _mm512_loadu_si512(&C[i]);
        /* Load 2 bytes containing 8 trits (8 × 2 bits = 16 bits) */
        /* Each byte contains 4 trits, so 8 trits = 2 bytes */
        size_t byte_idx = i / 4;
        uint16_t trit_packed = ((uint16_t)B_trit[byte_idx + 1] << 8) | 
                                B_trit[byte_idx];
        /* Extract 8 trits into array */
        uint64_t trits[8];
        for (int j = 0; j < 8; j++) {
            trits[j] = (trit_packed >> (j * 2)) & 0b11;
        }
        /* Load trits into 512-bit vector */
        __m512i trit_vec = _mm512_set_epi64(
            trits[7], trits[6], trits[5], trits[4],
            trits[3], trits[2], trits[1], trits[0]
        );
        /* Create nonzero mask: true if trit != 0b00 */
        /* This correctly handles both +1 (0b01) and -1 (0b10) */
        __mmask8 nonzero_mask = _mm512_cmpneq_epi64_mask(trit_vec, zero);
        /* Extract sign bit (high bit) for negative detection */
        /* sign=1 only for -1 (0b10), sign=0 for +1 (0b01) and 0 (0b00) */
        __m512i sign = _mm512_srli_epi64(trit_vec, 1);
        sign = _mm512_and_si512(sign, mask_low);
        __mmask8 sign_mask = _mm512_cmpneq_epi64_mask(sign, zero);
        /* Compute contribution: 0 if trit=0, A if trit!=0 */
        __m512i contribution = _mm512_maskz_mov_epi64(nonzero_mask, a_vec);
        /* Conditionally negate if sign=1 (i.e., trit=0b10=-1) */
        __m512i negated = _mm512_sub_epi64(zero, contribution);
        contribution = _mm512_mask_blend_epi64(sign_mask, contribution, negated);
        /* FMA: C += contribution (update accumulator) */
        c_vec = _mm512_add_epi64(c_vec, contribution);
        /* Store result */
        _mm512_storeu_si512(&C[i], c_vec);
    }
 }
 #else
 /* Fallback to scalar if AVX-512 not available */
 void sparse_ternary_fma_avx512(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 ) {
    sparse_ternary_fma_scalar(A, B_trit, C, N);
 }
 #endif
 /* ========================================================================== */
 /* Sparse Implementation                                                     */
 /* ========================================================================== */
 void sparse_ternary_fma_sparse(
    const int64_t* A,
    const uint32_t* indices,
    const int8_t* values,
    int64_t* C,
    size_t w
 ) {
    for (size_t i = 0; i < w; i++) {
        uint32_t idx = indices[i];
        int8_t value = values[i];
        if (value == 1) {
            C[idx] += A[idx];
        } else {  /* value == -1 */
            C[idx] -= A[idx];
        }
    }
 }
 /* ========================================================================== */
 /* Automatic Dispatch                                                        */
 /* ========================================================================== */
 void sparse_ternary_fma(
    const int64_t* A,
    const uint8_t* B_trit,
    int64_t* C,
    size_t N
 ) {
 #if HAS_AVX512
    if (has_avx512_support() && N >= 8 && N % 8 == 0) {
        sparse_ternary_fma_avx512(A, B_trit, C, N);
    } else {
        sparse_ternary_fma_scalar(A, B_trit, C, N);
    }
 #else
    sparse_ternary_fma_scalar(A, B_trit, C, N);
 #endif
 }
 /* ========================================================================== */
 /* Utility Functions                                                         */
 /* ========================================================================== */
 int has_avx512_support(void) {
 #ifdef __x86_64__
    unsigned int eax, ebx, ecx, edx;
    /* Check for AVX-512 Foundation (AVX512F) */
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        return (ebx & (1 << 16)) != 0;  /* Bit 16 = AVX512F */
    }
 #endif
    return 0;
 }
 const char* get_fma_implementation(void) {
 #if HAS_AVX512
    if (has_avx512_support()) {
        return "AVX-512 (SIMD)";
    }
 #endif
    return "Scalar (Reference)";
 }
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -6901,7 +6901,8 @@ static const ggml_type all_types[] = {
    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
    GGML_TYPE_Q6_K,
-    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_TQ2_0, // sparse-ternary-fma integration
    // GGML_TYPE_TQ1_0, // TODO: implement for all backends
    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
@ -6924,7 +6925,8 @@ static const ggml_type other_types[] = {
    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
    GGML_TYPE_Q5_K,
    GGML_TYPE_Q6_K,
-    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_TQ2_0, // sparse-ternary-fma integration
    // GGML_TYPE_TQ1_0, // TODO: implement for all backends
    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
@ -7682,6 +7684,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 1, 64, 256, {1,  1}, {1, 1}));
    }
    // TQ2_0 specific test cases for sparse-ternary-fma integration
    // Test various sizes to verify threshold-based dispatch and correctness
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 256, {1, 1}, {1, 1}));   // Small (below threshold)
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 512, {1, 1}, {1, 1}));   // Medium
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 1024, {1, 1}, {1, 1}));  // At threshold
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 2048, {1, 1}, {1, 1}));  // Above threshold
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 8, 1024, {1, 1}, {1, 1}));  // Mat-vec above threshold
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 64, 64, 1024, {1, 1}, {1, 1})); // Mat-mat above threshold
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 16, 1, 4096, {2, 3}, {1, 1}));  // Batched
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_TQ2_0, GGML_TYPE_F32, 32, 32, 2048, {1, 1}, {1, 1})); // Large mat-mat
 #if 0
    // test the mat-mat path for Metal
    for (int k = 1; k < 512; ++k) {