diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 7dcb031f0f..6f6a32c84a 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -103,6 +103,14 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+# Enable SVE for AArch64 targets.
+# Do NOT define __ARM_FEATURE_SVE (or __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+# manually: they are compiler-predefined feature macros.  Forcing them selects
+# SVE intrinsic code paths even when the compiler/target provides no SVE,
+# breaking the build or producing SIGILL at runtime.  Request the feature via
+# -march instead and let the compiler set the macros.  Avoid -march=native,
+# -O3 and -fopenmp here: they break cross-compilation and stomp build flags.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
+    message(STATUS "Enabling ARM SVE support")
+    add_compile_options(-march=armv8.2-a+sve)
+endif()
 if (MSVC)
     string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
     message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c
index b0909dac08..e630d4c98d 100644
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@@ -44,7 +44,36 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
     block_q8_0 * GGML_RESTRICT y = vy;
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_FEATURE_SVE)
+    // Elements (f32 lanes) per SVE register; vector-length agnostic.
+    const int ggml_f32_epr = ggml_cpu_get_sve_cnt() * 8 / 32;
+    const svint32_t zero_s32 = svdup_n_s32(0);
+
+    for (int i = 0; i < nb; i++) {
+        // Pass 1: absolute maximum of the block.  svwhilelt keeps every
+        // load in bounds even when the vector length does not divide QK8_0
+        // (e.g. 384-bit VL -> 12 lanes, 2048-bit VL -> 64 lanes).
+        svfloat32_t vmax = svdup_n_f32(0.0f);
+        for (int j = 0; j < QK8_0; j += ggml_f32_epr) {
+            const svbool_t pg = svwhilelt_b32_s32(j, QK8_0);
+            const svfloat32_t v = svld1_f32(pg, x + i*QK8_0 + j);
+            vmax = svmax_f32_m(pg, vmax, svabs_f32_x(pg, v));
+        }
+        const float amax = svmaxv_f32(svptrue_b32(), vmax);
+
+        const float d  = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        // Pass 2: scale, round to nearest even (matches the NEON vcvtnq
+        // path -- plain svcvt truncates), then narrow-store as int8.
+        for (int j = 0; j < QK8_0; j += ggml_f32_epr) {
+            const svbool_t pg = svwhilelt_b32_s32(j, QK8_0);
+            const svfloat32_t v  = svld1_f32(pg, x + i*QK8_0 + j);
+            const svfloat32_t vr = svrintn_f32_x(pg, svmul_n_f32_x(pg, v, id));
+            svst1b_s32(pg, &y[i].qs[j], svcvt_s32_f32_m(zero_s32, pg, vr));
+        }
+    }
+#elif defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index e389a46dbe..587751c72b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -347,13 +347,33 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI
 
     const int nb = k / qk;
 
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
-        }
-    }
+#if defined(__ARM_FEATURE_SVE)
+    // Predicated SVE path: svwhilelt keeps every load/store in bounds for
+    // any vector length (a 2048-bit VL has 64 f32 lanes, more than qk == 32).
+    const int ggml_f32_epr = svcntw();
+    const svfloat32_t zero_f32 = svdup_n_f32(0.0f);
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const int8_t * xq = x[i].qs;
+        float * yb = y + i*qk;
+
+        for (int j = 0; j < qk; j += ggml_f32_epr) {
+            const svbool_t pg = svwhilelt_b32_s32(j, qk);
+            const svint32_t   vi = svld1sb_s32(pg, xq + j);            // sign-extend int8 -> int32
+            const svfloat32_t vf = svcvt_f32_s32_m(zero_f32, pg, vi);  // int32 -> f32
+            svst1_f32(pg, yb + j, svmul_n_f32_x(pg, vf, d));           // scale by d and store
+        }
+    }
+#else
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk; ++j) {
+            y[i*qk + j] = x[i].qs[j]*d;
+        }
+    }
+#endif
 }
 
 //