From a73b9d361a0a2e3407aa72c36abf7a62e378c6b3 Mon Sep 17 00:00:00 2001
From: Antoine Viallon
Date: Thu, 15 Jan 2026 10:54:23 +0100
Subject: [PATCH] ggml-cpu: add q4_0 repack support for wasm

---
 ggml/src/ggml-cpu/CMakeLists.txt       |   5 +-
 ggml/src/ggml-cpu/arch/wasm/repack.cpp | 631 +++++++++++++++++++++++++
 ggml/src/ggml-cpu/repack.cpp           |   8 +
 3 files changed, 643 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/ggml-cpu/arch/wasm/repack.cpp

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 7622d0bf49..d9f24f5686 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -543,7 +543,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
-        list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
+        list (APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/wasm/quants.c
+            ggml-cpu/arch/wasm/repack.cpp
+        )
     else()
         message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
         list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
diff --git a/ggml/src/ggml-cpu/arch/wasm/repack.cpp b/ggml/src/ggml-cpu/arch/wasm/repack.cpp
new file mode 100644
index 0000000000..5dee26dbcb
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/wasm/repack.cpp
@@ -0,0 +1,631 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#if defined(__wasm_simd128__)
+#include <wasm_simd128.h>
+#endif
+
+#include "../../repack.h"
+
+#define UNUSED GGML_UNUSED
+
+#if defined(__wasm_simd128__)
+
+// Wasm SIMD128 optimized quantization for Q8_0 4x4 interleaved blocks
+void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv[4][8];
+        float id[4];
+
+        // Process 4 rows
+        for (int row = 0; row < 4; row++) {
+            v128_t asrcv[8];
+            v128_t amaxv[8];
+
+            // Load 8 vectors of 4 floats each (32 floats total per row)
+            for (int j = 0; j < 8; j++) {
+                srcv[row][j] = wasm_v128_load(x + row * k + i * 32 + 4 * j);
+            }
+
+            // Compute absolute values
+            for (int j = 0; j < 8; j++) {
+                asrcv[j] = wasm_f32x4_abs(srcv[row][j]);
+            }
+
+            // Find maximum across all 8 vectors using pairwise reduction
+            for (int j = 0; j < 4; j++) {
+                amaxv[2 * j] = wasm_f32x4_max(asrcv[2 * j], asrcv[2 * j + 1]);
+            }
+            for (int j = 0; j < 2; j++) {
+                amaxv[4 * j] = wasm_f32x4_max(amaxv[4 * j], amaxv[4 * j + 2]);
+            }
+            amaxv[0] = wasm_f32x4_max(amaxv[0], amaxv[4]);
+
+            // Extract maximum from the final vector
+            float amax = wasm_f32x4_extract_lane(amaxv[0], 0);
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 1));
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 2));
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 3));
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row] = GGML_CPU_FP32_TO_FP16(d);
+        }
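+
+        // NOTE (editor annotation, inferred from the stores below): each group j of
+        // 4 quantized values is written with the 4 rows back to back, i.e.
+        // qs[16*j..16*j+3] = row0, qs[16*j+4..16*j+7] = row1, and so on, so a
+        // 4-wide kernel can walk all 4 rows with consecutive loads.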
+
+        // Quantize and interleave with blocklen=4
+        for (int j = 0; j < 8; j++) {
+            for (int row = 0; row < 4; row++) {
+                v128_t v = wasm_f32x4_mul(srcv[row][j], wasm_f32x4_splat(id[row]));
+                v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+                // Store interleaved: row0[0-3], row1[0-3], row2[0-3], row3[0-3]
+                y[i].qs[16 * j + row * 4 + 0] = wasm_i32x4_extract_lane(vi, 0);
+                y[i].qs[16 * j + row * 4 + 1] = wasm_i32x4_extract_lane(vi, 1);
+                y[i].qs[16 * j + row * 4 + 2] = wasm_i32x4_extract_lane(vi, 2);
+                y[i].qs[16 * j + row * 4 + 3] = wasm_i32x4_extract_lane(vi, 3);
+            }
+        }
+    }
+}
+
+// Wasm SIMD128 optimized quantization for Q8_0 4x8 interleaved blocks
+void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv[4][8];
+        float id[4];
+
+        // Process 4 rows
+        for (int row = 0; row < 4; row++) {
+            v128_t asrcv[8];
+            v128_t amaxv[8];
+
+            // Load 8 vectors of 4 floats each (32 floats total per row)
+            for (int j = 0; j < 8; j++) {
+                srcv[row][j] = wasm_v128_load(x + row * k + i * 32 + 4 * j);
+            }
+
+            // Compute absolute values
+            for (int j = 0; j < 8; j++) {
+                asrcv[j] = wasm_f32x4_abs(srcv[row][j]);
+            }
+
+            // Find maximum across all 8 vectors
+            for (int j = 0; j < 4; j++) {
+                amaxv[2 * j] = wasm_f32x4_max(asrcv[2 * j], asrcv[2 * j + 1]);
+            }
+            for (int j = 0; j < 2; j++) {
+                amaxv[4 * j] = wasm_f32x4_max(amaxv[4 * j], amaxv[4 * j + 2]);
+            }
+            amaxv[0] = wasm_f32x4_max(amaxv[0], amaxv[4]);
+
+            float amax = wasm_f32x4_extract_lane(amaxv[0], 0);
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 1));
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 2));
+            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 3));
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row] = GGML_CPU_FP32_TO_FP16(d);
+        }
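+
+        // NOTE (editor annotation): same scheme as the 4x4 variant above, but each
+        // row now contributes 8 consecutive bytes per group, matching blocklen=8.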
+
+        // Quantize and interleave with blocklen=8
+        for (int j = 0; j < 4; j++) {
+            for (int row = 0; row < 4; row++) {
+                // First 4 floats of block
+                v128_t v0 = wasm_f32x4_mul(srcv[row][2 * j], wasm_f32x4_splat(id[row]));
+                v128_t vi0 = wasm_i32x4_trunc_sat_f32x4(v0);
+
+                // Second 4 floats of block
+                v128_t v1 = wasm_f32x4_mul(srcv[row][2 * j + 1], wasm_f32x4_splat(id[row]));
+                v128_t vi1 = wasm_i32x4_trunc_sat_f32x4(v1);
+
+                // Store interleaved with blocklen=8
+                y[i].qs[32 * j + row * 8 + 0] = wasm_i32x4_extract_lane(vi0, 0);
+                y[i].qs[32 * j + row * 8 + 1] = wasm_i32x4_extract_lane(vi0, 1);
+                y[i].qs[32 * j + row * 8 + 2] = wasm_i32x4_extract_lane(vi0, 2);
+                y[i].qs[32 * j + row * 8 + 3] = wasm_i32x4_extract_lane(vi0, 3);
+                y[i].qs[32 * j + row * 8 + 4] = wasm_i32x4_extract_lane(vi1, 0);
+                y[i].qs[32 * j + row * 8 + 5] = wasm_i32x4_extract_lane(vi1, 1);
+                y[i].qs[32 * j + row * 8 + 6] = wasm_i32x4_extract_lane(vi1, 2);
+                y[i].qs[32 * j + row * 8 + 7] = wasm_i32x4_extract_lane(vi1, 3);
+            }
+        }
+    }
+}
+
+// Wasm SIMD128 optimized GEMV for Q4_0 4x4 with Q8_0 activation
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+
+    for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x_idx * nb);
+
+        v128_t acc = wasm_f32x4_splat(0.0f);
+
+        for (int l = 0; l < nb; l++) {
+            float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+            v128_t b_d = wasm_f32x4_make(
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
+            );
+
+            v128_t sumi = wasm_i32x4_splat(0);
+
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int i = 0; i < blocklen; i++) {
+                    int base = k * ncols_interleaved * blocklen + i;
+
+                    int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
+                    int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
+                    int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
+                    int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
+
+                    v128_t v0 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b0 << 4),
+                        (int32_t)(int8_t)(b1 << 4),
+                        (int32_t)(int8_t)(b2 << 4),
+                        (int32_t)(int8_t)(b3 << 4)
+                    );
+                    v128_t v1 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b0 & 0xF0),
+                        (int32_t)(int8_t)(b1 & 0xF0),
+                        (int32_t)(int8_t)(b2 & 0xF0),
+                        (int32_t)(int8_t)(b3 & 0xF0)
+                    );
+
+                    int32_t a_val_lo = a_ptr[l].qs[k * blocklen + i];
+                    int32_t a_val_hi = a_ptr[l].qs[k * blocklen + i + qk / 2];
+
+                    v128_t mul0 = wasm_i32x4_mul(v0, wasm_i32x4_splat(a_val_lo));
+                    v128_t mul1 = wasm_i32x4_mul(v1, wasm_i32x4_splat(a_val_hi));
+                    v128_t sum = wasm_i32x4_add(mul0, mul1);
+                    sum = wasm_i32x4_shr(sum, 4);
+                    sumi = wasm_i32x4_add(sumi, sum);
+                }
+            }
+
+            v128_t sumf = wasm_f32x4_convert_i32x4(sumi);
+            v128_t scale = wasm_f32x4_mul(b_d, wasm_f32x4_splat(a_d));
+            acc = wasm_f32x4_add(acc, wasm_f32x4_mul(sumf, scale));
+        }
+
+        wasm_v128_store(s + x_idx * ncols_interleaved, acc);
+    }
+}
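+
+// NOTE (editor annotation, hedged): the GEMV above and the GEMM kernels below read
+// the Q4_0 nibbles without subtracting the usual -8 offset, relying on the repack
+// step (make_block_q4_0x4/x8 in ggml-cpu/repack.cpp, which appears to XOR the
+// packed bytes with 0x88) to have already converted them to two's complement.
+// (b << 4) sign-extends the low nibble scaled by 16, (b & 0xF0) is the high nibble
+// scaled by 16, and the arithmetic >> 4 removes that scaling after the
+// multiply-add, exactly, since every product is a multiple of 16.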
+
+// Wasm SIMD128 optimized GEMM for Q4_0 4x4 with Q8_0 activation
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+    assert(nr % 4 == 0);
+
+    for (int row = 0; row < nr; row += 4) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) ((const char *) vy + row * nb * sizeof(block_q8_0));
+
+        for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x_idx * nb);
+
+            v128_t acc[4] = {
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f)
+            };
+
+            for (int l = 0; l < nb; l++) {
+                v128_t b_d = wasm_f32x4_make(
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
+                );
+
+                for (int r = 0; r < 4; r++) {
+                    float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d[r]);
+                    v128_t sumi = wasm_i32x4_splat(0);
+
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int i = 0; i < blocklen; i++) {
+                            int base = k * ncols_interleaved * blocklen + i;
+
+                            int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
+                            int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
+                            int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
+                            int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
+
+                            v128_t v0 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b0 << 4),
+                                (int32_t)(int8_t)(b1 << 4),
+                                (int32_t)(int8_t)(b2 << 4),
+                                (int32_t)(int8_t)(b3 << 4)
+                            );
+                            v128_t v1 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b0 & 0xF0),
+                                (int32_t)(int8_t)(b1 & 0xF0),
+                                (int32_t)(int8_t)(b2 & 0xF0),
+                                (int32_t)(int8_t)(b3 & 0xF0)
+                            );
+
+                            int32_t a_val_lo = a_ptr[l].qs[k * 16 + r * blocklen + i];
+                            int32_t a_val_hi = a_ptr[l].qs[k * 16 + r * blocklen + i + 64];
+
+                            v128_t mul0 = wasm_i32x4_mul(v0, wasm_i32x4_splat(a_val_lo));
+                            v128_t mul1 = wasm_i32x4_mul(v1, wasm_i32x4_splat(a_val_hi));
+                            v128_t sum = wasm_i32x4_add(mul0, mul1);
+                            sum = wasm_i32x4_shr(sum, 4);
+                            sumi = wasm_i32x4_add(sumi, sum);
+                        }
+                    }
+
+                    v128_t sumf = wasm_f32x4_convert_i32x4(sumi);
+                    v128_t scale = wasm_f32x4_mul(b_d, wasm_f32x4_splat(a_d));
+                    acc[r] = wasm_f32x4_add(acc[r], wasm_f32x4_mul(sumf, scale));
+                }
+            }
+
+            // Store with the output row stride bs, matching the generic kernels
+            // (nc can be a sub-range of the full row when the work is chunked)
+            for (int r = 0; r < 4; r++) {
+                wasm_v128_store(s + (row + r) * bs + x_idx * ncols_interleaved, acc[r]);
+            }
+        }
+    }
+}
+
+// For other functions, fall back to generic implementations
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
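+
+// NOTE (editor annotation): with 8 interleaved columns the per-column sums no
+// longer fit in one v128 register, so the 8x8 GEMV below keeps two i32x4/f32x4
+// accumulator pairs, one for columns 0-3 (scaled by d[0..3]) and one for
+// columns 4-7 (scaled by d[4..7]).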
+
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+
+    for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x_idx * nb);
+
+        v128_t acc0 = wasm_f32x4_splat(0.0f);
+        v128_t acc1 = wasm_f32x4_splat(0.0f);
+
+        for (int l = 0; l < nb; l++) {
+            float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+            v128_t b_d0 = wasm_f32x4_make(
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
+            );
+            v128_t b_d1 = wasm_f32x4_make(
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
+                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
+            );
+
+            v128_t sumi0 = wasm_i32x4_splat(0);
+            v128_t sumi1 = wasm_i32x4_splat(0);
+
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int i = 0; i < blocklen; i++) {
+                    int base = k * ncols_interleaved * blocklen + i;
+
+                    int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
+                    int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
+                    int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
+                    int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
+                    int8_t b4 = b_ptr[l].qs[base + 4 * blocklen];
+                    int8_t b5 = b_ptr[l].qs[base + 5 * blocklen];
+                    int8_t b6 = b_ptr[l].qs[base + 6 * blocklen];
+                    int8_t b7 = b_ptr[l].qs[base + 7 * blocklen];
+
+                    v128_t v0_0 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b0 << 4),
+                        (int32_t)(int8_t)(b1 << 4),
+                        (int32_t)(int8_t)(b2 << 4),
+                        (int32_t)(int8_t)(b3 << 4)
+                    );
+                    v128_t v1_0 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b0 & 0xF0),
+                        (int32_t)(int8_t)(b1 & 0xF0),
+                        (int32_t)(int8_t)(b2 & 0xF0),
+                        (int32_t)(int8_t)(b3 & 0xF0)
+                    );
+
+                    v128_t v0_1 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b4 << 4),
+                        (int32_t)(int8_t)(b5 << 4),
+                        (int32_t)(int8_t)(b6 << 4),
+                        (int32_t)(int8_t)(b7 << 4)
+                    );
+                    v128_t v1_1 = wasm_i32x4_make(
+                        (int32_t)(int8_t)(b4 & 0xF0),
+                        (int32_t)(int8_t)(b5 & 0xF0),
+                        (int32_t)(int8_t)(b6 & 0xF0),
+                        (int32_t)(int8_t)(b7 & 0xF0)
+                    );
+
+                    int32_t a_val_lo = a_ptr[l].qs[k * blocklen + i];
+                    int32_t a_val_hi = a_ptr[l].qs[k * blocklen + i + qk / 2];
+
+                    v128_t mul0_0 = wasm_i32x4_mul(v0_0, wasm_i32x4_splat(a_val_lo));
+                    v128_t mul1_0 = wasm_i32x4_mul(v1_0, wasm_i32x4_splat(a_val_hi));
+                    v128_t sum0 = wasm_i32x4_add(mul0_0, mul1_0);
+                    sum0 = wasm_i32x4_shr(sum0, 4);
+                    sumi0 = wasm_i32x4_add(sumi0, sum0);
+
+                    v128_t mul0_1 = wasm_i32x4_mul(v0_1, wasm_i32x4_splat(a_val_lo));
+                    v128_t mul1_1 = wasm_i32x4_mul(v1_1, wasm_i32x4_splat(a_val_hi));
+                    v128_t sum1 = wasm_i32x4_add(mul0_1, mul1_1);
+                    sum1 = wasm_i32x4_shr(sum1, 4);
+                    sumi1 = wasm_i32x4_add(sumi1, sum1);
+                }
+            }
+
+            v128_t sumf0 = wasm_f32x4_convert_i32x4(sumi0);
+            v128_t sumf1 = wasm_f32x4_convert_i32x4(sumi1);
+            v128_t scale0 = wasm_f32x4_mul(b_d0, wasm_f32x4_splat(a_d));
+            v128_t scale1 = wasm_f32x4_mul(b_d1, wasm_f32x4_splat(a_d));
+            acc0 = wasm_f32x4_add(acc0, wasm_f32x4_mul(sumf0, scale0));
+            acc1 = wasm_f32x4_add(acc1, wasm_f32x4_mul(sumf1, scale1));
+        }
+
+        wasm_v128_store(s + x_idx * ncols_interleaved, acc0);
+        wasm_v128_store(s + x_idx * ncols_interleaved + 4, acc1);
+    }
+}
+
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
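+
+// NOTE (editor annotation): the activations below are block_q8_0x4, i.e. 4 rows
+// interleaved in chunks of blocklen bytes, so row r's low-nibble operand for
+// position (k, i) sits at k * 4 * blocklen + r * blocklen + i, and its high-nibble
+// partner a further qk / 2 * 4 = 64 bytes in, mirroring the 4x8 quantization
+// layout above.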
+
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+    assert(nr % 4 == 0);
+
+    for (int row = 0; row < nr; row += 4) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) ((const char *) vy + row * nb * sizeof(block_q8_0));
+
+        for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x_idx * nb);
+
+            v128_t acc0[4] = {
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f)
+            };
+            v128_t acc1[4] = {
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f),
+                wasm_f32x4_splat(0.0f)
+            };
+
+            for (int l = 0; l < nb; l++) {
+                v128_t b_d0 = wasm_f32x4_make(
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
+                );
+                v128_t b_d1 = wasm_f32x4_make(
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
+                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
+                );
+
+                for (int r = 0; r < 4; r++) {
+                    float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d[r]);
+                    v128_t sumi0 = wasm_i32x4_splat(0);
+                    v128_t sumi1 = wasm_i32x4_splat(0);
+
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int i = 0; i < blocklen; i++) {
+                            int base = k * ncols_interleaved * blocklen + i;
+
+                            int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
+                            int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
+                            int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
+                            int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
+                            int8_t b4 = b_ptr[l].qs[base + 4 * blocklen];
+                            int8_t b5 = b_ptr[l].qs[base + 5 * blocklen];
+                            int8_t b6 = b_ptr[l].qs[base + 6 * blocklen];
+                            int8_t b7 = b_ptr[l].qs[base + 7 * blocklen];
+
+                            v128_t v0_0 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b0 << 4),
+                                (int32_t)(int8_t)(b1 << 4),
+                                (int32_t)(int8_t)(b2 << 4),
+                                (int32_t)(int8_t)(b3 << 4)
+                            );
+                            v128_t v1_0 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b0 & 0xF0),
+                                (int32_t)(int8_t)(b1 & 0xF0),
+                                (int32_t)(int8_t)(b2 & 0xF0),
+                                (int32_t)(int8_t)(b3 & 0xF0)
+                            );
+
+                            v128_t v0_1 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b4 << 4),
+                                (int32_t)(int8_t)(b5 << 4),
+                                (int32_t)(int8_t)(b6 << 4),
+                                (int32_t)(int8_t)(b7 << 4)
+                            );
+                            v128_t v1_1 = wasm_i32x4_make(
+                                (int32_t)(int8_t)(b4 & 0xF0),
+                                (int32_t)(int8_t)(b5 & 0xF0),
+                                (int32_t)(int8_t)(b6 & 0xF0),
+                                (int32_t)(int8_t)(b7 & 0xF0)
+                            );
+
+                            int32_t a_val_lo = a_ptr[l].qs[k * 4 * blocklen + r * blocklen + i];
+                            int32_t a_val_hi = a_ptr[l].qs[k * 4 * blocklen + r * blocklen + i + qk / 2 * 4];
+
+                            v128_t mul0_0 = wasm_i32x4_mul(v0_0, wasm_i32x4_splat(a_val_lo));
+                            v128_t mul1_0 = wasm_i32x4_mul(v1_0, wasm_i32x4_splat(a_val_hi));
+                            v128_t sum0 = wasm_i32x4_add(mul0_0, mul1_0);
+                            sum0 = wasm_i32x4_shr(sum0, 4);
+                            sumi0 = wasm_i32x4_add(sumi0, sum0);
+
+                            v128_t mul0_1 = wasm_i32x4_mul(v0_1, wasm_i32x4_splat(a_val_lo));
+                            v128_t mul1_1 = wasm_i32x4_mul(v1_1, wasm_i32x4_splat(a_val_hi));
+                            v128_t sum1 = wasm_i32x4_add(mul0_1, mul1_1);
+                            sum1 = wasm_i32x4_shr(sum1, 4);
+                            sumi1 = wasm_i32x4_add(sumi1, sum1);
+                        }
+                    }
+
+                    v128_t sumf0 = wasm_f32x4_convert_i32x4(sumi0);
+                    v128_t sumf1 = wasm_f32x4_convert_i32x4(sumi1);
+                    v128_t scale0 = wasm_f32x4_mul(b_d0, wasm_f32x4_splat(a_d));
+                    v128_t scale1 = wasm_f32x4_mul(b_d1, wasm_f32x4_splat(a_d));
+                    acc0[r] = wasm_f32x4_add(acc0[r], wasm_f32x4_mul(sumf0, scale0));
+                    acc1[r] = wasm_f32x4_add(acc1[r], wasm_f32x4_mul(sumf1, scale1));
+                }
+            }
+
+            // Store with the output row stride bs, matching the generic kernels
+            for (int r = 0; r < 4; r++) {
+                wasm_v128_store(s + (row + r) * bs + x_idx * ncols_interleaved, acc0[r]);
+                wasm_v128_store(s + (row + r) * bs + x_idx * ncols_interleaved + 4, acc1[r]);
+            }
+        }
+    }
+}
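+
+// NOTE (editor annotation): only the Q4_0 paths are vectorized in this patch; the
+// remaining entry points delegate to the shared generic implementations declared
+// through ../../repack.h.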
+
+// Q4_K functions - fall back to generic
+void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    ggml_quantize_mat_q8_K_4x4_generic(x, vy, k);
+}
+
+void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    ggml_quantize_mat_q8_K_4x8_generic(x, vy, k);
+}
+
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+// Q2_K functions - fall back to generic
+void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+// IQ4_NL functions - fall back to generic
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+// Q8_0 functions - fall back to generic
+void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+#endif // __wasm_simd128__
+
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index fbf7ed9432..48760e1d72 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -2460,6 +2460,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q4_0_4x4_q8_0;
             }
         }
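+        // NOTE (editor annotation): prefer the 8x8 interleaved layout when ne[1]
+        // is a multiple of 8, otherwise fall back to the 4x4 layout.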
+        if (ggml_cpu_has_wasm_simd()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q4_0_8x8_q8_0;
+            }
+            if (cur->ne[1] % 4 == 0) {
+                return &q4_0_4x4_q8_0;
+            }
+        }
     } else if (cur->type == GGML_TYPE_Q4_K) {
         if (ggml_cpu_has_avx2()) {
             if (cur->ne[1] % 8 == 0) {