Merge a73b9d361a into 06bf3796f4
commit f7887cd9f0
@@ -543,7 +543,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
-        list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
+        list (APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/wasm/quants.c
+            ggml-cpu/arch/wasm/repack.cpp
+            )
     else()
         message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
         list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
ggml-cpu/arch/wasm/repack.cpp (new file, added to GGML_CPU_SOURCES above; all lines added)
@@ -0,0 +1,631 @@
#define GGML_COMMON_IMPL_CPP
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml-backend-impl.h"

#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"

#include <cmath>
#include <cstring>
#include <cassert>
#include <cstdlib> // for qsort
#include <cstdio>  // for GGML_ASSERT

#if defined(__wasm_simd128__)
#include <wasm_simd128.h>
#endif

#include "../../repack.h"

#define UNUSED GGML_UNUSED

#if defined(__wasm_simd128__)

// Wasm SIMD128 optimized quantization for Q8_0 4x4 interleaved blocks
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    for (int i = 0; i < nb; i++) {
        v128_t srcv[4][8];
        float id[4];

        // Process 4 rows
        for (int row = 0; row < 4; row++) {
            v128_t asrcv[8];
            v128_t amaxv[8];

            // Load 8 vectors of 4 floats each (32 floats total per row)
            for (int j = 0; j < 8; j++) {
                srcv[row][j] = wasm_v128_load(x + row * k + i * 32 + 4 * j);
            }

            // Compute absolute values
            for (int j = 0; j < 8; j++) {
                asrcv[j] = wasm_f32x4_abs(srcv[row][j]);
            }

            // Find maximum across all 8 vectors using pairwise reduction
            for (int j = 0; j < 4; j++) {
                amaxv[2 * j] = wasm_f32x4_max(asrcv[2 * j], asrcv[2 * j + 1]);
            }
            for (int j = 0; j < 2; j++) {
                amaxv[4 * j] = wasm_f32x4_max(amaxv[4 * j], amaxv[4 * j + 2]);
            }
            amaxv[0] = wasm_f32x4_max(amaxv[0], amaxv[4]);

            // Extract maximum from the final vector
            float amax = wasm_f32x4_extract_lane(amaxv[0], 0);
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 1));
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 2));
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 3));

            const float d = amax / ((1 << 7) - 1);
            id[row] = d ? 1.0f / d : 0.0f;

            y[i].d[row] = GGML_CPU_FP32_TO_FP16(d);
        }

        // Quantize and interleave with blocklen=4
        for (int j = 0; j < 8; j++) {
            for (int row = 0; row < 4; row++) {
                v128_t v = wasm_f32x4_mul(srcv[row][j], wasm_f32x4_splat(id[row]));
                v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);

                // Store interleaved: row0[0-3], row1[0-3], row2[0-3], row3[0-3]
                y[i].qs[16 * j + row * 4 + 0] = wasm_i32x4_extract_lane(vi, 0);
                y[i].qs[16 * j + row * 4 + 1] = wasm_i32x4_extract_lane(vi, 1);
                y[i].qs[16 * j + row * 4 + 2] = wasm_i32x4_extract_lane(vi, 2);
                y[i].qs[16 * j + row * 4 + 3] = wasm_i32x4_extract_lane(vi, 3);
            }
        }
    }
}

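For reference, a scalar sketch of the per-row math above (illustrative only, not part of the patch; the helper name quantize_row_q8_0_ref is made up): the scale d is the row's absolute maximum divided by 127, values are multiplied by 1/d, and the SIMD path then truncates with saturation (wasm_i32x4_trunc_sat_f32x4) rather than rounding to nearest.

// Illustrative scalar equivalent of one 32-value row of the SIMD path above.
static void quantize_row_q8_0_ref(const float * x, int8_t * q, float * d_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; j++) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / ((1 << 7) - 1);   // 127, the int8 max magnitude
    const float id = d ? 1.0f / d : 0.0f;
    for (int j = 0; j < 32; j++) {
        float v = x[j] * id;
        // saturate, then truncate toward zero, mirroring wasm_i32x4_trunc_sat_f32x4
        if (v >  127.0f) v =  127.0f;
        if (v < -128.0f) v = -128.0f;
        q[j] = (int8_t) v;
    }
    *d_out = d;
}
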
// Wasm SIMD128 optimized quantization for Q8_0 4x8 interleaved blocks
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    for (int i = 0; i < nb; i++) {
        v128_t srcv[4][8];
        float id[4];

        // Process 4 rows
        for (int row = 0; row < 4; row++) {
            v128_t asrcv[8];
            v128_t amaxv[8];

            // Load 8 vectors of 4 floats each (32 floats total per row)
            for (int j = 0; j < 8; j++) {
                srcv[row][j] = wasm_v128_load(x + row * k + i * 32 + 4 * j);
            }

            // Compute absolute values
            for (int j = 0; j < 8; j++) {
                asrcv[j] = wasm_f32x4_abs(srcv[row][j]);
            }

            // Find maximum across all 8 vectors
            for (int j = 0; j < 4; j++) {
                amaxv[2 * j] = wasm_f32x4_max(asrcv[2 * j], asrcv[2 * j + 1]);
            }
            for (int j = 0; j < 2; j++) {
                amaxv[4 * j] = wasm_f32x4_max(amaxv[4 * j], amaxv[4 * j + 2]);
            }
            amaxv[0] = wasm_f32x4_max(amaxv[0], amaxv[4]);

            float amax = wasm_f32x4_extract_lane(amaxv[0], 0);
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 1));
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 2));
            amax = fmaxf(amax, wasm_f32x4_extract_lane(amaxv[0], 3));

            const float d = amax / ((1 << 7) - 1);
            id[row] = d ? 1.0f / d : 0.0f;

            y[i].d[row] = GGML_CPU_FP32_TO_FP16(d);
        }

        // Quantize and interleave with blocklen=8
        for (int j = 0; j < 4; j++) {
            for (int row = 0; row < 4; row++) {
                // First 4 floats of block
                v128_t v0 = wasm_f32x4_mul(srcv[row][2 * j], wasm_f32x4_splat(id[row]));
                v128_t vi0 = wasm_i32x4_trunc_sat_f32x4(v0);

                // Second 4 floats of block
                v128_t v1 = wasm_f32x4_mul(srcv[row][2 * j + 1], wasm_f32x4_splat(id[row]));
                v128_t vi1 = wasm_i32x4_trunc_sat_f32x4(v1);

                // Store interleaved with blocklen=8
                y[i].qs[32 * j + row * 8 + 0] = wasm_i32x4_extract_lane(vi0, 0);
                y[i].qs[32 * j + row * 8 + 1] = wasm_i32x4_extract_lane(vi0, 1);
                y[i].qs[32 * j + row * 8 + 2] = wasm_i32x4_extract_lane(vi0, 2);
                y[i].qs[32 * j + row * 8 + 3] = wasm_i32x4_extract_lane(vi0, 3);
                y[i].qs[32 * j + row * 8 + 4] = wasm_i32x4_extract_lane(vi1, 0);
                y[i].qs[32 * j + row * 8 + 5] = wasm_i32x4_extract_lane(vi1, 1);
                y[i].qs[32 * j + row * 8 + 6] = wasm_i32x4_extract_lane(vi1, 2);
                y[i].qs[32 * j + row * 8 + 7] = wasm_i32x4_extract_lane(vi1, 3);
            }
        }
    }
}

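The only difference between the two routines above is the interleaving granularity. A small sketch of the destination offset of one quantized value inside block_q8_0x4::qs (illustrative helpers, not part of the patch; the names are made up):

// blocklen = 4: runs of 4 consecutive values per row (group = 0..7, lane = 0..3)
static inline int q8_0_4x4_idx(int group, int row, int lane) {
    return 16 * group + row * 4 + lane;
}
// blocklen = 8: runs of 8 consecutive values per row (group = 0..3, lane = 0..7)
static inline int q8_0_4x8_idx(int group, int row, int lane) {
    return 32 * group + row * 8 + lane;
}
// In both layouts the 4 rows are interleaved; only the run length of values
// kept together from a single row (4 vs 8) differs.
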
// Wasm SIMD128 optimized GEMV for Q4_0 4x4 with Q8_0 activation
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;

    for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x_idx * nb);

        v128_t acc = wasm_f32x4_splat(0.0f);

        for (int l = 0; l < nb; l++) {
            float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
            v128_t b_d = wasm_f32x4_make(
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
            );

            v128_t sumi = wasm_i32x4_splat(0);

            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int i = 0; i < blocklen; i++) {
                    int base = k * ncols_interleaved * blocklen + i;

                    int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
                    int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
                    int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
                    int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];

                    v128_t v0 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b0 << 4),
                        (int32_t)(int8_t)(b1 << 4),
                        (int32_t)(int8_t)(b2 << 4),
                        (int32_t)(int8_t)(b3 << 4)
                    );
                    v128_t v1 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b0 & 0xF0),
                        (int32_t)(int8_t)(b1 & 0xF0),
                        (int32_t)(int8_t)(b2 & 0xF0),
                        (int32_t)(int8_t)(b3 & 0xF0)
                    );

                    int32_t a_val_lo = a_ptr[l].qs[k * blocklen + i];
                    int32_t a_val_hi = a_ptr[l].qs[k * blocklen + i + qk / 2];

                    v128_t mul0 = wasm_i32x4_mul(v0, wasm_i32x4_splat(a_val_lo));
                    v128_t mul1 = wasm_i32x4_mul(v1, wasm_i32x4_splat(a_val_hi));
                    v128_t sum = wasm_i32x4_add(mul0, mul1);
                    sum = wasm_i32x4_shr(sum, 4);
                    sumi = wasm_i32x4_add(sumi, sum);
                }
            }

            v128_t sumf = wasm_f32x4_convert_i32x4(sumi);
            v128_t scale = wasm_f32x4_mul(b_d, wasm_f32x4_splat(a_d));
            acc = wasm_f32x4_add(acc, wasm_f32x4_mul(sumf, scale));
        }

        wasm_v128_store(s + x_idx * ncols_interleaved, acc);
    }
}

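A scalar view of the nibble handling in the inner loop above (illustrative only, not part of the patch; q4_dot_term is a made-up name): the low nibble is moved into the high bits with (int8_t)(b << 4), the high nibble is isolated with (b & 0xF0), and the arithmetic shift right by 4 after the multiply removes the common factor of 16, so both halves contribute at the same scale.

static inline int32_t q4_dot_term(int8_t b, int32_t a_lo, int32_t a_hi) {
    const int32_t v_lo = (int8_t) (b << 4);    // low  nibble, sign-extended, scaled by 16
    const int32_t v_hi = (int8_t) (b & 0xF0);  // high nibble, sign-extended, scaled by 16
    return (v_lo * a_lo + v_hi * a_hi) >> 4;   // exact: the sum is a multiple of 16
}
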
// Wasm SIMD128 optimized GEMM for Q4_0 4x4 with Q8_0 activation
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);
    assert(nr % 4 == 0);

    UNUSED(bs);

    for (int row = 0; row < nr; row += 4) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) ((const char *)vy + row * nb * sizeof(block_q8_0));

        for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x_idx * nb);

            v128_t acc[4] = {
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f)
            };

            for (int l = 0; l < nb; l++) {
                v128_t b_d = wasm_f32x4_make(
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
                );

                for (int r = 0; r < 4; r++) {
                    float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d[r]);
                    v128_t sumi = wasm_i32x4_splat(0);

                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                        for (int i = 0; i < blocklen; i++) {
                            int base = k * ncols_interleaved * blocklen + i;

                            int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
                            int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
                            int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
                            int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];

                            v128_t v0 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b0 << 4),
                                (int32_t)(int8_t)(b1 << 4),
                                (int32_t)(int8_t)(b2 << 4),
                                (int32_t)(int8_t)(b3 << 4)
                            );
                            v128_t v1 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b0 & 0xF0),
                                (int32_t)(int8_t)(b1 & 0xF0),
                                (int32_t)(int8_t)(b2 & 0xF0),
                                (int32_t)(int8_t)(b3 & 0xF0)
                            );

                            int32_t a_val_lo = a_ptr[l].qs[k * 16 + r * blocklen + i];
                            int32_t a_val_hi = a_ptr[l].qs[k * 16 + r * blocklen + i + 64];

                            v128_t mul0 = wasm_i32x4_mul(v0, wasm_i32x4_splat(a_val_lo));
                            v128_t mul1 = wasm_i32x4_mul(v1, wasm_i32x4_splat(a_val_hi));
                            v128_t sum = wasm_i32x4_add(mul0, mul1);
                            sum = wasm_i32x4_shr(sum, 4);
                            sumi = wasm_i32x4_add(sumi, sum);
                        }
                    }

                    v128_t sumf = wasm_f32x4_convert_i32x4(sumi);
                    v128_t scale = wasm_f32x4_mul(b_d, wasm_f32x4_splat(a_d));
                    acc[r] = wasm_f32x4_add(acc[r], wasm_f32x4_mul(sumf, scale));
                }
            }

            for (int r = 0; r < 4; r++) {
                wasm_v128_store(s + (row + r) * nc + x_idx * ncols_interleaved, acc[r]);
            }
        }
    }
}

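The activation indexing in the GEMM above mirrors the layout written by ggml_quantize_mat_q8_0_4x4: with blocklen = 4, qs[k * 16 + r * 4 + i] is lane i of group k for row r, and the + 64 offset jumps to group k + 4, i.e. the second half of the row. A small sketch (illustrative helper, not part of the patch; the name is made up):

// Offset of activation (row r, group j, lane i) in block_q8_0x4::qs,
// as produced by ggml_quantize_mat_q8_0_4x4 above (blocklen = 4).
static inline int q8_0x4_act_offset(int j, int r, int i) {
    return 16 * j + r * 4 + i;
}
// The k loop (k = 0..3) reads group j = k     -> offsets  0..63  (a_val_lo)
//                        and group j = k + 4  -> offsets 64..127 (a_val_hi)
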
// For other functions, fall back to generic implementations
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;

    for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x_idx * nb);

        v128_t acc0 = wasm_f32x4_splat(0.0f);
        v128_t acc1 = wasm_f32x4_splat(0.0f);

        for (int l = 0; l < nb; l++) {
            float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
            v128_t b_d0 = wasm_f32x4_make(
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
            );
            v128_t b_d1 = wasm_f32x4_make(
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
                GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
            );

            v128_t sumi0 = wasm_i32x4_splat(0);
            v128_t sumi1 = wasm_i32x4_splat(0);

            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int i = 0; i < blocklen; i++) {
                    int base = k * ncols_interleaved * blocklen + i;

                    int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
                    int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
                    int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
                    int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
                    int8_t b4 = b_ptr[l].qs[base + 4 * blocklen];
                    int8_t b5 = b_ptr[l].qs[base + 5 * blocklen];
                    int8_t b6 = b_ptr[l].qs[base + 6 * blocklen];
                    int8_t b7 = b_ptr[l].qs[base + 7 * blocklen];

                    v128_t v0_0 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b0 << 4),
                        (int32_t)(int8_t)(b1 << 4),
                        (int32_t)(int8_t)(b2 << 4),
                        (int32_t)(int8_t)(b3 << 4)
                    );
                    v128_t v1_0 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b0 & 0xF0),
                        (int32_t)(int8_t)(b1 & 0xF0),
                        (int32_t)(int8_t)(b2 & 0xF0),
                        (int32_t)(int8_t)(b3 & 0xF0)
                    );

                    v128_t v0_1 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b4 << 4),
                        (int32_t)(int8_t)(b5 << 4),
                        (int32_t)(int8_t)(b6 << 4),
                        (int32_t)(int8_t)(b7 << 4)
                    );
                    v128_t v1_1 = wasm_i32x4_make(
                        (int32_t)(int8_t)(b4 & 0xF0),
                        (int32_t)(int8_t)(b5 & 0xF0),
                        (int32_t)(int8_t)(b6 & 0xF0),
                        (int32_t)(int8_t)(b7 & 0xF0)
                    );

                    int32_t a_val_lo = a_ptr[l].qs[k * blocklen + i];
                    int32_t a_val_hi = a_ptr[l].qs[k * blocklen + i + qk / 2];

                    v128_t mul0_0 = wasm_i32x4_mul(v0_0, wasm_i32x4_splat(a_val_lo));
                    v128_t mul1_0 = wasm_i32x4_mul(v1_0, wasm_i32x4_splat(a_val_hi));
                    v128_t sum0 = wasm_i32x4_add(mul0_0, mul1_0);
                    sum0 = wasm_i32x4_shr(sum0, 4);
                    sumi0 = wasm_i32x4_add(sumi0, sum0);

                    v128_t mul0_1 = wasm_i32x4_mul(v0_1, wasm_i32x4_splat(a_val_lo));
                    v128_t mul1_1 = wasm_i32x4_mul(v1_1, wasm_i32x4_splat(a_val_hi));
                    v128_t sum1 = wasm_i32x4_add(mul0_1, mul1_1);
                    sum1 = wasm_i32x4_shr(sum1, 4);
                    sumi1 = wasm_i32x4_add(sumi1, sum1);
                }
            }

            v128_t sumf0 = wasm_f32x4_convert_i32x4(sumi0);
            v128_t sumf1 = wasm_f32x4_convert_i32x4(sumi1);
            v128_t scale0 = wasm_f32x4_mul(b_d0, wasm_f32x4_splat(a_d));
            v128_t scale1 = wasm_f32x4_mul(b_d1, wasm_f32x4_splat(a_d));
            acc0 = wasm_f32x4_add(acc0, wasm_f32x4_mul(sumf0, scale0));
            acc1 = wasm_f32x4_add(acc1, wasm_f32x4_mul(sumf1, scale1));
        }

        wasm_v128_store(s + x_idx * ncols_interleaved, acc0);
        wasm_v128_store(s + x_idx * ncols_interleaved + 4, acc1);
    }
}

void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);
    assert(nr % 4 == 0);

    UNUSED(bs);

    for (int row = 0; row < nr; row += 4) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) ((const char *)vy + row * nb * sizeof(block_q8_0));

        for (int x_idx = 0; x_idx < nc / ncols_interleaved; x_idx++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x_idx * nb);

            v128_t acc0[4] = {
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f)
            };
            v128_t acc1[4] = {
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f),
                wasm_f32x4_splat(0.0f)
            };

            for (int l = 0; l < nb; l++) {
                v128_t b_d0 = wasm_f32x4_make(
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3])
                );
                v128_t b_d1 = wasm_f32x4_make(
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
                    GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
                );

                for (int r = 0; r < 4; r++) {
                    float a_d = GGML_CPU_FP16_TO_FP32(a_ptr[l].d[r]);
                    v128_t sumi0 = wasm_i32x4_splat(0);
                    v128_t sumi1 = wasm_i32x4_splat(0);

                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                        for (int i = 0; i < blocklen; i++) {
                            int base = k * ncols_interleaved * blocklen + i;

                            int8_t b0 = b_ptr[l].qs[base + 0 * blocklen];
                            int8_t b1 = b_ptr[l].qs[base + 1 * blocklen];
                            int8_t b2 = b_ptr[l].qs[base + 2 * blocklen];
                            int8_t b3 = b_ptr[l].qs[base + 3 * blocklen];
                            int8_t b4 = b_ptr[l].qs[base + 4 * blocklen];
                            int8_t b5 = b_ptr[l].qs[base + 5 * blocklen];
                            int8_t b6 = b_ptr[l].qs[base + 6 * blocklen];
                            int8_t b7 = b_ptr[l].qs[base + 7 * blocklen];

                            v128_t v0_0 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b0 << 4),
                                (int32_t)(int8_t)(b1 << 4),
                                (int32_t)(int8_t)(b2 << 4),
                                (int32_t)(int8_t)(b3 << 4)
                            );
                            v128_t v1_0 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b0 & 0xF0),
                                (int32_t)(int8_t)(b1 & 0xF0),
                                (int32_t)(int8_t)(b2 & 0xF0),
                                (int32_t)(int8_t)(b3 & 0xF0)
                            );

                            v128_t v0_1 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b4 << 4),
                                (int32_t)(int8_t)(b5 << 4),
                                (int32_t)(int8_t)(b6 << 4),
                                (int32_t)(int8_t)(b7 << 4)
                            );
                            v128_t v1_1 = wasm_i32x4_make(
                                (int32_t)(int8_t)(b4 & 0xF0),
                                (int32_t)(int8_t)(b5 & 0xF0),
                                (int32_t)(int8_t)(b6 & 0xF0),
                                (int32_t)(int8_t)(b7 & 0xF0)
                            );

                            int32_t a_val_lo = a_ptr[l].qs[k * 4 * blocklen + r * blocklen + i];
                            int32_t a_val_hi = a_ptr[l].qs[k * 4 * blocklen + r * blocklen + i + qk / 2 * 4];

                            v128_t mul0_0 = wasm_i32x4_mul(v0_0, wasm_i32x4_splat(a_val_lo));
                            v128_t mul1_0 = wasm_i32x4_mul(v1_0, wasm_i32x4_splat(a_val_hi));
                            v128_t sum0 = wasm_i32x4_add(mul0_0, mul1_0);
                            sum0 = wasm_i32x4_shr(sum0, 4);
                            sumi0 = wasm_i32x4_add(sumi0, sum0);

                            v128_t mul0_1 = wasm_i32x4_mul(v0_1, wasm_i32x4_splat(a_val_lo));
                            v128_t mul1_1 = wasm_i32x4_mul(v1_1, wasm_i32x4_splat(a_val_hi));
                            v128_t sum1 = wasm_i32x4_add(mul0_1, mul1_1);
                            sum1 = wasm_i32x4_shr(sum1, 4);
                            sumi1 = wasm_i32x4_add(sumi1, sum1);
                        }
                    }

                    v128_t sumf0 = wasm_f32x4_convert_i32x4(sumi0);
                    v128_t sumf1 = wasm_f32x4_convert_i32x4(sumi1);
                    v128_t scale0 = wasm_f32x4_mul(b_d0, wasm_f32x4_splat(a_d));
                    v128_t scale1 = wasm_f32x4_mul(b_d1, wasm_f32x4_splat(a_d));
                    acc0[r] = wasm_f32x4_add(acc0[r], wasm_f32x4_mul(sumf0, scale0));
                    acc1[r] = wasm_f32x4_add(acc1[r], wasm_f32x4_mul(sumf1, scale1));
                }
            }

            for (int r = 0; r < 4; r++) {
                wasm_v128_store(s + (row + r) * nc + x_idx * ncols_interleaved, acc0[r]);
                wasm_v128_store(s + (row + r) * nc + x_idx * ncols_interleaved + 4, acc1[r]);
            }
        }
    }
}

// Q4_K functions - fall back to generic
void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    ggml_quantize_mat_q8_K_4x4_generic(x, vy, k);
}

void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    ggml_quantize_mat_q8_K_4x8_generic(x, vy, k);
}

void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

// Q2_K functions - fall back to generic
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

// IQ4_NL functions - fall back to generic
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

// Q8_0 functions - fall back to generic
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

#endif // __wasm_simd128__

@@ -3073,6 +3073,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q4_0_4x4_q8_0;
             }
         }
+        if (ggml_cpu_has_wasm_simd()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q4_0_8x8_q8_0;
+            }
+            if (cur->ne[1] % 4 == 0) {
+                return &q4_0_4x4_q8_0;
+            }
+        }
     } else if (cur->type == GGML_TYPE_Q4_K) {
         if (ggml_cpu_has_avx2()) {
             if (cur->ne[1] % 8 == 0) {
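A note on the selection order this hunk adds (illustrative comment only, assuming the surrounding checks are unchanged): when wasm SIMD is available, the 8x8 repack layout is preferred and 4x4 is the fallback.

// Illustrative only: outcomes of the added wasm branch for a few ne[1] values.
//   ne[1] = 4096 -> &q4_0_8x8_q8_0  (4096 % 8 == 0)
//   ne[1] =  132 -> &q4_0_4x4_q8_0  ( 132 % 8 != 0, 132 % 4 == 0)
//   ne[1] =   30 -> neither; control falls through to the remaining checks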