From 397f61590d17bf0835f68d508fbd2777f8480b3c Mon Sep 17 00:00:00 2001 From: vithulep Date: Tue, 10 Jun 2025 15:17:58 +0530 Subject: [PATCH 01/21] Implementation of SVE for kernel quantize_row_q8_0() and dequantize_row_q8_0() --- ggml/src/CMakeLists.txt | 9 +++++++++ ggml/src/ggml-cpu/arch/arm/quants.c | 29 ++++++++++++++++++++++++++++- ggml/src/ggml-quants.c | 28 +++++++++++++++++++++++----- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 7dcb031f0f..6f6a32c84a 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -103,6 +103,15 @@ endif() # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +# Enable SVE for ARMv8-A+ architectures +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + message(STATUS "Enabling ARM SVE support") + add_compile_options(-march=native -fopenmp -O3 -ftree-vectorize) + add_compile_definitions( + __ARM_FEATURE_SVE=1 + __ARM_FEATURE_FP16_VECTOR_ARITHMETIC=1 + ) +endif() if (MSVC) string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index b0909dac08..e630d4c98d 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -44,7 +44,34 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i block_q8_0 * GGML_RESTRICT y = vy; -#if defined(__ARM_NEON) +#if defined(__ARM_FEATURE_SVE) + const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; + const int ggml_f32_epr = sve_register_length / 32; + const svfloat32_t inactive1 = svdup_n_f32(0.0f); + const svint32_t inactive2 = svdup_n_s32(0); + const svbool_t pg = svptrue_b32(); + for (int i = 0; i < nb; i+=1) { + svfloat32_t srcv1, asrcv1; + svfloat32_t sv_max = svdup_n_f32(0.0f); + float32_t amax = 0.0; + + for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { + srcv1 = svld1_f32(pg, x + i*32 + j); + asrcv1 = svabs_f32_m(inactive1, pg, srcv1); + sv_max = svmax_f32_m(pg, sv_max, asrcv1); + } + amax = svmaxv_f32(pg, sv_max); + float32_t d = amax / ((1 << 7) - 1); + float32_t id = d ? 1.0f/d : 0.0f; + y[i].d = GGML_FP32_TO_FP16(d); + for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { + srcv1 = svld1_f32(pg, x + i*32 + j); + const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id); + const svint32_t vi1 = svcvt_s32_f32_m(inactive2, pg, v1); + svst1b_s32(pg, &y[i].qs[j], vi1); + } + } +#elif defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; float32x4_t asrcv[8]; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e389a46dbe..587751c72b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -347,13 +347,31 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI const int nb = k / qk; - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + #if defined(__ARM_FEATURE_SVE) + svbool_t pg = svptrue_b32(); + const svfloat32_t inactive1 = svdup_n_f32(0.0f); + const int ggml_f32_epr = svcntw(); - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; + for (int i = 0; i < nb; i+=1) { + const float d1 = GGML_FP16_TO_FP32(x[i].d); // d:0 + + const int8_t *x_data1 = x[i].qs; + float *y_base = y + i * qk; + for (int j = 0; j < qk; j+=ggml_f32_epr) { + svint32_t vec0 = svld1sb_s32(pg, x_data1 + j); + svfloat32_t fvec0 = svmul_n_f32_m(pg, svcvt_f32_s32_m(inactive1, pg, vec0), d1); // Convert to float and scale + svst1_f32(pg, y_base + j, fvec0); + } } - } + #else + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } + #endif } // From c6158b076666fdce00122e9ad5ece6cda3fcefa3 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 09:31:51 +0530 Subject: [PATCH 02/21] Remove spaces --- ggml/src/ggml-quants.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 587751c72b..949d80caf1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -366,7 +366,6 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI #else for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - for (int j = 0; j < qk; ++j) { y[i*qk + j] = x[i].qs[j]*d; } From 9922ee7c242c504c5f10d8a5ea1c2b2a4ac5a417 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 09:36:42 +0530 Subject: [PATCH 03/21] row removed --- ggml/src/ggml-quants.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 949d80caf1..4204c808d8 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -354,7 +354,6 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI for (int i = 0; i < nb; i+=1) { const float d1 = GGML_FP16_TO_FP32(x[i].d); // d:0 - const int8_t *x_data1 = x[i].qs; float *y_base = y + i * qk; for (int j = 0; j < qk; j+=ggml_f32_epr) { From 139f717a53161691898755ecc28ba089c02e76ea Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 09:38:47 +0530 Subject: [PATCH 04/21] row removed --- ggml/src/ggml-quants.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4204c808d8..deaea58f3a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -351,7 +351,6 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI svbool_t pg = svptrue_b32(); const svfloat32_t inactive1 = svdup_n_f32(0.0f); const int ggml_f32_epr = svcntw(); - for (int i = 0; i < nb; i+=1) { const float d1 = GGML_FP16_TO_FP32(x[i].d); // d:0 const int8_t *x_data1 = x[i].qs; From 88cf63c8d43e53f4558c410cb120da3965ca49a5 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 10:27:58 +0530 Subject: [PATCH 05/21] Updated cmake file --- ggml/src/CMakeLists.txt | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6f6a32c84a..80f7ff6240 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -103,15 +103,7 @@ endif() # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -# Enable SVE for ARMv8-A+ architectures -if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") - message(STATUS "Enabling ARM SVE support") - add_compile_options(-march=native -fopenmp -O3 -ftree-vectorize) - add_compile_definitions( - __ARM_FEATURE_SVE=1 - __ARM_FEATURE_FP16_VECTOR_ARITHMETIC=1 - ) -endif() + if (MSVC) string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") From f321910a3b6249718f0f1bd240c0bce11dcb5ca2 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 10:41:07 +0530 Subject: [PATCH 06/21] Updated cmake file --- ggml/src/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 80f7ff6240..7dcb031f0f 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -103,7 +103,6 @@ endif() # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") - if (MSVC) string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") From de0a047f47176af381ae9e999c1856b0142ef603 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 11:49:15 +0530 Subject: [PATCH 07/21] added row --- ggml/src/ggml-quants.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index deaea58f3a..6d0d5d4ec1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -364,6 +364,7 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI #else for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < qk; ++j) { y[i*qk + j] = x[i].qs[j]*d; } From 3405a588a493e78422c71f2438e94c6a40a23fa5 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 15:23:59 +0530 Subject: [PATCH 08/21] updated cmake file --- ggml/src/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index d91dbc46fe..3bced163cc 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -103,6 +103,15 @@ endif() # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +# Enable SVE for ARMv8-A+ architectures +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + message(STATUS "Enabling ARM SVE support") + add_compile_options(-march=native -fopenmp -O3 -ftree-vectorize) + add_compile_definitions( + __ARM_FEATURE_SVE=1 + __ARM_FEATURE_FP16_VECTOR_ARITHMETIC=1 + ) +endif() if (MSVC) string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") From 72e532c8c3f028e0cf667ea1197e5e1e3a243ad3 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 11 Jun 2025 15:35:41 +0530 Subject: [PATCH 09/21] updated cmake file --- ggml/src/CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 3bced163cc..d91dbc46fe 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -103,15 +103,6 @@ endif() # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -# Enable SVE for ARMv8-A+ architectures -if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") - message(STATUS "Enabling ARM SVE support") - add_compile_options(-march=native -fopenmp -O3 -ftree-vectorize) - add_compile_definitions( - __ARM_FEATURE_SVE=1 - __ARM_FEATURE_FP16_VECTOR_ARITHMETIC=1 - ) -endif() if (MSVC) string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") From 3a3007f63ef2ed7418ab9e8885e1b01bd39c6574 Mon Sep 17 00:00:00 2001 From: vithulep Date: Fri, 13 Jun 2025 10:11:12 +0530 Subject: [PATCH 10/21] Added comments --- ggml/src/ggml-quants.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 6d0d5d4ec1..a617f951e2 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -340,6 +340,7 @@ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRI } } +// SVE Support added for Scaler Implementation void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK8_0; From 2add7877159ff00bca99516d05009ccf4c15684a Mon Sep 17 00:00:00 2001 From: vithulep Date: Tue, 17 Jun 2025 14:50:06 +0530 Subject: [PATCH 14/21] Removed SVE Implementation of Dequantized row --- ggml/src/ggml-cpu/arch/arm/quants.c | 3 ++- ggml/src/ggml-quants.c | 27 +++++---------------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index e630d4c98d..cb502d4b40 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -43,7 +43,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const int nb = k / QK8_0; block_q8_0 * GGML_RESTRICT y = vy; - + + // printf("Here"); #if defined(__ARM_FEATURE_SVE) const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; const int ggml_f32_epr = sve_register_length / 32; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index a617f951e2..e389a46dbe 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -340,7 +340,6 @@ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRI } } -// SVE Support added for Scaler Implementation void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK8_0; @@ -348,29 +347,13 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI const int nb = k / qk; - #if defined(__ARM_FEATURE_SVE) - svbool_t pg = svptrue_b32(); - const svfloat32_t inactive1 = svdup_n_f32(0.0f); - const int ggml_f32_epr = svcntw(); - for (int i = 0; i < nb; i+=1) { - const float d1 = GGML_FP16_TO_FP32(x[i].d); // d:0 - const int8_t *x_data1 = x[i].qs; - float *y_base = y + i * qk; - for (int j = 0; j < qk; j+=ggml_f32_epr) { - svint32_t vec0 = svld1sb_s32(pg, x_data1 + j); - svfloat32_t fvec0 = svmul_n_f32_m(pg, svcvt_f32_s32_m(inactive1, pg, vec0), d1); // Convert to float and scale - svst1_f32(pg, y_base + j, fvec0); - } - } - #else - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; - } + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } - #endif + } } // From b1487ec29310ec4baf591756988ccfd38fdd5148 Mon Sep 17 00:00:00 2001 From: vithulep Date: Tue, 17 Jun 2025 15:41:18 +0530 Subject: [PATCH 15/21] remove white spaces --- ggml/src/ggml-cpu/arch/arm/quants.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index cb502d4b40..e630d4c98d 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -43,8 +43,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const int nb = k / QK8_0; block_q8_0 * GGML_RESTRICT y = vy; - - // printf("Here"); + #if defined(__ARM_FEATURE_SVE) const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; const int ggml_f32_epr = sve_register_length / 32; From d46f3637a30fac0ff9e555d0dd16a66ff194f4c1 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 25 Jun 2025 13:20:13 +0530 Subject: [PATCH 16/21] Updated Quantize_row_q8_0() function --- ggml/src/ggml-cpu/arch/arm/quants.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index e630d4c98d..f100ba73a2 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -48,8 +48,10 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; const int ggml_f32_epr = sve_register_length / 32; const svfloat32_t inactive1 = svdup_n_f32(0.0f); - const svint32_t inactive2 = svdup_n_s32(0); const svbool_t pg = svptrue_b32(); + svfloat32_t zero = svdup_f32(0.0f); + svfloat32_t half = svdup_f32(0.5f); + for (int i = 0; i < nb; i+=1) { svfloat32_t srcv1, asrcv1; svfloat32_t sv_max = svdup_n_f32(0.0f); @@ -67,8 +69,14 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { srcv1 = svld1_f32(pg, x + i*32 + j); const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id); - const svint32_t vi1 = svcvt_s32_f32_m(inactive2, pg, v1); - svst1b_s32(pg, &y[i].qs[j], vi1); + + svbool_t ge_zero = svcmpge_f32(pg, v1, zero); + svfloat32_t v_pos = svadd_f32_m(pg, v1, half); + svfloat32_t v_neg = svsub_f32_m(pg, v1, half); + + svfloat32_t v_rounded = svsel_f32(ge_zero, v_pos, v_neg); + svint32_t result = svcvt_s32_f32_x(pg, v_rounded); + svst1b_s32(pg, &y[i].qs[j], result); } } #elif defined(__ARM_NEON) From 1d47d63d40eee89bfd3399949e9307b89ad4aa1f Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 25 Jun 2025 13:46:11 +0530 Subject: [PATCH 17/21] removed white spaces --- ggml/src/ggml-cpu/arch/arm/quants.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index f100ba73a2..bbfe1cf753 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -51,7 +51,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const svbool_t pg = svptrue_b32(); svfloat32_t zero = svdup_f32(0.0f); svfloat32_t half = svdup_f32(0.5f); - + for (int i = 0; i < nb; i+=1) { svfloat32_t srcv1, asrcv1; svfloat32_t sv_max = svdup_n_f32(0.0f); @@ -70,7 +70,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i srcv1 = svld1_f32(pg, x + i*32 + j); const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id); - svbool_t ge_zero = svcmpge_f32(pg, v1, zero); + svbool_t ge_zero = svcmpge_f32(pg, v1, zero); svfloat32_t v_pos = svadd_f32_m(pg, v1, half); svfloat32_t v_neg = svsub_f32_m(pg, v1, half); From a318c587cd3e2e0ac07965c28e6d47553a3dda6e Mon Sep 17 00:00:00 2001 From: vithulep Date: Fri, 4 Jul 2025 11:16:32 +0530 Subject: [PATCH 18/21] change quant.c --- ggml/src/ggml-cpu/arch/arm/quants.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index bbfe1cf753..42f018efd7 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -45,7 +45,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; + const int sve_register_length = svcntb() * 8; const int ggml_f32_epr = sve_register_length / 32; const svfloat32_t inactive1 = svdup_n_f32(0.0f); const svbool_t pg = svptrue_b32(); @@ -79,6 +79,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i svst1b_s32(pg, &y[i].qs[j], result); } } + #elif defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; From a6b0726eec79501c8a589e3cf41fe83316bba57c Mon Sep 17 00:00:00 2001 From: vithulep Date: Thu, 31 Jul 2025 09:43:32 +0530 Subject: [PATCH 19/21] Added sve for dequantized_q8_0 --- ggml/src/ggml-cpu/arch/arm/quants.c | 108 +++++++++++++++++++--------- ggml/src/ggml-quants.c | 28 ++++++-- 2 files changed, 98 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index 42f018efd7..61fde4f162 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -36,51 +36,84 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif - +#include +static clock_t total_time = 0; +static int call_count = 0; void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + clock_t start = clock(); assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0 * GGML_RESTRICT y = vy; -#if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; - const int ggml_f32_epr = sve_register_length / 32; - const svfloat32_t inactive1 = svdup_n_f32(0.0f); - const svbool_t pg = svptrue_b32(); - svfloat32_t zero = svdup_f32(0.0f); - svfloat32_t half = svdup_f32(0.5f); +// #if defined(__ARM_FEATURE_SVE) +// // const int sve_register_length = svcntb() * 8; //get the vector length +// // const int ggml_f32_epr = sve_register_length / 32; +// const svfloat32_t inactive1 = svdup_n_f32(0.0f); +// const svbool_t pg = svptrue_b32(); +// svfloat32_t zero = svdup_f32(0.0f); +// svfloat32_t half = svdup_f32(0.5f); +// const svint32_t inactive2 = svdup_n_s32(0); - for (int i = 0; i < nb; i+=1) { - svfloat32_t srcv1, asrcv1; - svfloat32_t sv_max = svdup_n_f32(0.0f); - float32_t amax = 0.0; +// for (int i = 0; i < nb; i+=1) { +// svfloat32_t srcv1, srcv2, srcv3, srcv4; +// svfloat32_t asrcv1, asrcv2, asrcv3, asrcv4; +// float32_t amax1 = 0.0; - for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { - srcv1 = svld1_f32(pg, x + i*32 + j); - asrcv1 = svabs_f32_m(inactive1, pg, srcv1); - sv_max = svmax_f32_m(pg, sv_max, asrcv1); - } - amax = svmaxv_f32(pg, sv_max); - float32_t d = amax / ((1 << 7) - 1); - float32_t id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); - for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { - srcv1 = svld1_f32(pg, x + i*32 + j); - const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id); +// srcv1 = svld1_f32(pg, x + i*32); +// asrcv1 = svabs_f32_m(inactive1, pg, srcv1); - svbool_t ge_zero = svcmpge_f32(pg, v1, zero); - svfloat32_t v_pos = svadd_f32_m(pg, v1, half); - svfloat32_t v_neg = svsub_f32_m(pg, v1, half); +// srcv2 = svld1_f32(pg, x + i*32 + 8); +// asrcv2 = svabs_f32_m(inactive1, pg, srcv2); - svfloat32_t v_rounded = svsel_f32(ge_zero, v_pos, v_neg); - svint32_t result = svcvt_s32_f32_x(pg, v_rounded); - svst1b_s32(pg, &y[i].qs[j], result); - } - } +// srcv3 = svld1_f32(pg, x + i*32 + 16); +// asrcv3 = svabs_f32_m(inactive1, pg, srcv3); -#elif defined(__ARM_NEON) +// srcv4 = svld1_f32(pg, x + i*32 + 24); +// asrcv4 = svabs_f32_m(inactive1, pg, srcv4); + +// svfloat32_t max1 = svmax_f32_m(pg, asrcv2, asrcv1); +// svfloat32_t max2 = svmax_f32_m(pg, asrcv4, asrcv3); +// svfloat32_t max3 = svmax_f32_m(pg, max2, max1); +// amax1 = svmaxv_f32(pg, max3); + +// float32_t d1 = amax1 / ((1 << 7) - 1); +// float32_t id1 = d1 ? 1.0f/d1 : 0.0f; +// y[i].d = GGML_FP32_TO_FP16(d1); + +// const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id1); +// const svfloat32_t v2 = svmul_n_f32_m(pg, srcv2, id1); +// const svfloat32_t v3 = svmul_n_f32_m(pg, srcv3, id1); +// const svfloat32_t v4 = svmul_n_f32_m(pg, srcv4, id1); + +// svbool_t ge_zero = svcmpge_f32(pg, v1, zero); +// svfloat32_t v_rounded = svsel_f32(ge_zero, svadd_f32_m(pg, v1, half), svsub_f32_m(pg, v1, half)); +// // svint32_t v_rounded = svcvt_s32_f32_m(inactive2, pg, v1); + +// svbool_t ge_zero_2 = svcmpge_f32(pg, v2, zero); +// svfloat32_t v_rounded_2 = svsel_f32(ge_zero_2, svadd_f32_m(pg, v2, half), svsub_f32_m(pg, v2, half)); + +// svbool_t ge_zero_3 = svcmpge_f32(pg, v3, zero); +// svfloat32_t v_rounded_3 = svsel_f32(ge_zero_3, svadd_f32_m(pg, v3, half), svsub_f32_m(pg, v3, half)); + +// svbool_t ge_zero_4 = svcmpge_f32(pg, v4, zero); +// svfloat32_t v_rounded_4 = svsel_f32(ge_zero_4, svadd_f32_m(pg, v4, half), svsub_f32_m(pg, v4, half)); + +// svint32_t result = svcvt_s32_f32_x(pg, v_rounded); +// svst1b_s32(pg, &y[i].qs[0], result); + +// svint32_t result_2 = svcvt_s32_f32_x(pg, v_rounded_2); +// svst1b_s32(pg, &y[i].qs[8], result_2); + +// svint32_t result_3 = svcvt_s32_f32_x(pg, v_rounded_3); +// svst1b_s32(pg, &y[i].qs[16], result_3); + +// svint32_t result_4 = svcvt_s32_f32_x(pg, v_rounded_4); +// svst1b_s32(pg, &y[i].qs[24], result_4); +// } + +#if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; float32x4_t asrcv[8]; @@ -115,6 +148,15 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // scalar quantize_row_q8_0_ref(x, y, k); #endif + clock_t end = clock(); + + total_time += (end - start); + call_count++; + + printf("my_function call %d, time this call: %.3f ms, total time: %.3f ms\n", + call_count, + 1000.0 * (end - start) / CLOCKS_PER_SEC, + 1000.0 * total_time / CLOCKS_PER_SEC); } void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e389a46dbe..587751c72b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -347,13 +347,31 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI const int nb = k / qk; - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + #if defined(__ARM_FEATURE_SVE) + svbool_t pg = svptrue_b32(); + const svfloat32_t inactive1 = svdup_n_f32(0.0f); + const int ggml_f32_epr = svcntw(); - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; + for (int i = 0; i < nb; i+=1) { + const float d1 = GGML_FP16_TO_FP32(x[i].d); // d:0 + + const int8_t *x_data1 = x[i].qs; + float *y_base = y + i * qk; + for (int j = 0; j < qk; j+=ggml_f32_epr) { + svint32_t vec0 = svld1sb_s32(pg, x_data1 + j); + svfloat32_t fvec0 = svmul_n_f32_m(pg, svcvt_f32_s32_m(inactive1, pg, vec0), d1); // Convert to float and scale + svst1_f32(pg, y_base + j, fvec0); + } } - } + #else + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } + #endif } // From 89bbd505993d78be8b5ba31f625fb989e3d886d2 Mon Sep 17 00:00:00 2001 From: vithulep Date: Thu, 31 Jul 2025 09:44:18 +0530 Subject: [PATCH 20/21] Added sve for dequantized_q8_0 --- ggml/src/ggml-cpu/arch/arm/quants.c | 94 ++++++++++------------------- 1 file changed, 32 insertions(+), 62 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index 61fde4f162..6457c649e4 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -47,73 +47,43 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i block_q8_0 * GGML_RESTRICT y = vy; -// #if defined(__ARM_FEATURE_SVE) -// // const int sve_register_length = svcntb() * 8; //get the vector length -// // const int ggml_f32_epr = sve_register_length / 32; -// const svfloat32_t inactive1 = svdup_n_f32(0.0f); -// const svbool_t pg = svptrue_b32(); -// svfloat32_t zero = svdup_f32(0.0f); -// svfloat32_t half = svdup_f32(0.5f); -// const svint32_t inactive2 = svdup_n_s32(0); +#if defined(__ARM_FEATURE_SVE) + const int sve_register_length = svcntb() * 8; + const int ggml_f32_epr = sve_register_length / 32; + const svfloat32_t inactive1 = svdup_n_f32(0.0f); + const svbool_t pg = svptrue_b32(); + svfloat32_t zero = svdup_f32(0.0f); + svfloat32_t half = svdup_f32(0.5f); -// for (int i = 0; i < nb; i+=1) { -// svfloat32_t srcv1, srcv2, srcv3, srcv4; -// svfloat32_t asrcv1, asrcv2, asrcv3, asrcv4; -// float32_t amax1 = 0.0; + for (int i = 0; i < nb; i+=1) { + svfloat32_t srcv1, asrcv1; + svfloat32_t sv_max = svdup_n_f32(0.0f); + float32_t amax = 0.0; -// srcv1 = svld1_f32(pg, x + i*32); -// asrcv1 = svabs_f32_m(inactive1, pg, srcv1); + for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { + srcv1 = svld1_f32(pg, x + i*32 + j); + asrcv1 = svabs_f32_m(inactive1, pg, srcv1); + sv_max = svmax_f32_m(pg, sv_max, asrcv1); + } + amax = svmaxv_f32(pg, sv_max); + float32_t d = amax / ((1 << 7) - 1); + float32_t id = d ? 1.0f/d : 0.0f; + y[i].d = GGML_FP32_TO_FP16(d); + for (int j = 0; j < QK8_0; j+=ggml_f32_epr) { + srcv1 = svld1_f32(pg, x + i*32 + j); + const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id); -// srcv2 = svld1_f32(pg, x + i*32 + 8); -// asrcv2 = svabs_f32_m(inactive1, pg, srcv2); + svbool_t ge_zero = svcmpge_f32(pg, v1, zero); + svfloat32_t v_pos = svadd_f32_m(pg, v1, half); + svfloat32_t v_neg = svsub_f32_m(pg, v1, half); -// srcv3 = svld1_f32(pg, x + i*32 + 16); -// asrcv3 = svabs_f32_m(inactive1, pg, srcv3); + svfloat32_t v_rounded = svsel_f32(ge_zero, v_pos, v_neg); + svint32_t result = svcvt_s32_f32_x(pg, v_rounded); + svst1b_s32(pg, &y[i].qs[j], result); + } + } -// srcv4 = svld1_f32(pg, x + i*32 + 24); -// asrcv4 = svabs_f32_m(inactive1, pg, srcv4); - -// svfloat32_t max1 = svmax_f32_m(pg, asrcv2, asrcv1); -// svfloat32_t max2 = svmax_f32_m(pg, asrcv4, asrcv3); -// svfloat32_t max3 = svmax_f32_m(pg, max2, max1); -// amax1 = svmaxv_f32(pg, max3); - -// float32_t d1 = amax1 / ((1 << 7) - 1); -// float32_t id1 = d1 ? 1.0f/d1 : 0.0f; -// y[i].d = GGML_FP32_TO_FP16(d1); - -// const svfloat32_t v1 = svmul_n_f32_m(pg, srcv1, id1); -// const svfloat32_t v2 = svmul_n_f32_m(pg, srcv2, id1); -// const svfloat32_t v3 = svmul_n_f32_m(pg, srcv3, id1); -// const svfloat32_t v4 = svmul_n_f32_m(pg, srcv4, id1); - -// svbool_t ge_zero = svcmpge_f32(pg, v1, zero); -// svfloat32_t v_rounded = svsel_f32(ge_zero, svadd_f32_m(pg, v1, half), svsub_f32_m(pg, v1, half)); -// // svint32_t v_rounded = svcvt_s32_f32_m(inactive2, pg, v1); - -// svbool_t ge_zero_2 = svcmpge_f32(pg, v2, zero); -// svfloat32_t v_rounded_2 = svsel_f32(ge_zero_2, svadd_f32_m(pg, v2, half), svsub_f32_m(pg, v2, half)); - -// svbool_t ge_zero_3 = svcmpge_f32(pg, v3, zero); -// svfloat32_t v_rounded_3 = svsel_f32(ge_zero_3, svadd_f32_m(pg, v3, half), svsub_f32_m(pg, v3, half)); - -// svbool_t ge_zero_4 = svcmpge_f32(pg, v4, zero); -// svfloat32_t v_rounded_4 = svsel_f32(ge_zero_4, svadd_f32_m(pg, v4, half), svsub_f32_m(pg, v4, half)); - -// svint32_t result = svcvt_s32_f32_x(pg, v_rounded); -// svst1b_s32(pg, &y[i].qs[0], result); - -// svint32_t result_2 = svcvt_s32_f32_x(pg, v_rounded_2); -// svst1b_s32(pg, &y[i].qs[8], result_2); - -// svint32_t result_3 = svcvt_s32_f32_x(pg, v_rounded_3); -// svst1b_s32(pg, &y[i].qs[16], result_3); - -// svint32_t result_4 = svcvt_s32_f32_x(pg, v_rounded_4); -// svst1b_s32(pg, &y[i].qs[24], result_4); -// } - -#if defined(__ARM_NEON) +#elif defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; float32x4_t asrcv[8]; From 2c87ef415b709ecaae20e5210ba74cc9cfd4df11 Mon Sep 17 00:00:00 2001 From: vithulep Date: Thu, 31 Jul 2025 12:33:36 +0530 Subject: [PATCH 21/21] Removed clock time code --- ggml/src/ggml-cpu/arch/arm/quants.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index 6457c649e4..42f018efd7 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -36,11 +36,8 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif -#include -static clock_t total_time = 0; -static int call_count = 0; + void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - clock_t start = clock(); assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -118,15 +115,6 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // scalar quantize_row_q8_0_ref(x, y, k); #endif - clock_t end = clock(); - - total_time += (end - start); - call_count++; - - printf("my_function call %d, time this call: %.3f ms, total time: %.3f ms\n", - call_count, - 1000.0 * (end - start) / CLOCKS_PER_SEC, - 1000.0 * total_time / CLOCKS_PER_SEC); } void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {