use GEMM_RM=4 for arm (also applies to avx512, which shares the branch); widen M to int64_t

This commit is contained in:
Aman Gupta 2026-02-13 15:19:51 +05:30
parent 9c660ddafe
commit 8d1be6c4cd
1 changed file with 5 additions and 5 deletions

View File

@@ -11,11 +11,11 @@
// TODO: add support for sizeless vector types
#if defined(GGML_SIMD) && !defined(__ARM_FEATURE_SVE) && !defined(__riscv_v_intrinsic)
// TODO: untested on avx512 and arm
// TODO: untested on avx512
// These are in units of GGML_F32_EPR
#if defined(__AVX512F__) || defined (__ARM_NEON__)
static constexpr int GEMM_RM = 6;
static constexpr int GEMM_RN = 4; // 24+4+1 = 29/32
static constexpr int GEMM_RM = 4;
static constexpr int GEMM_RN = 4; // 16+4+1 = 21/32
#elif defined(__AVX2__) || defined(__AVX__)
static constexpr int GEMM_RM = 6;
static constexpr int GEMM_RN = 2; // 12+2+1 = 15/16
@@ -66,7 +66,7 @@ static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int64_t K, int64_t N)
int64_t M, int64_t K, int64_t N)
{
static constexpr int KN = GGML_F32_EPR;
@@ -115,7 +115,7 @@ static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int64_t K, int64_t N)
int64_t M, int64_t K, int64_t N)
{
for (int i = 0; i < M; i++) {
for (int64_t j = 0; j < N; j++) {