# --------------------------------------------------------------------------
# 1. CPU feature detection library
# --------------------------------------------------------------------------
# Pull in Google's cpu_features sources from the repository's include/ tree.
# (Out-of-tree binary dir keeps the detector build separate from the tiers.)
add_subdirectory(
    ${CMAKE_CURRENT_LIST_DIR}/../../../../../../include/cpu_features
    ${CMAKE_BINARY_DIR}/cpu_features_build)

# CPU feature detection library (lightweight, loads first): queried by
# LLamaLibraryLoader to choose a hardware tier before any of the heavy
# llama_android_t* libraries are loaded.
add_library(llama_cpu_detector SHARED cpu_detector.cpp)
target_link_libraries(llama_cpu_detector
    PRIVATE CpuFeatures::cpu_features
            android
            log)

# --------------------------------------------------------------------------
# 2. Locate the root of the llama.cpp source tree
#    (six levels up from this CMakeLists.txt).
# --------------------------------------------------------------------------
set(LLAMA_SRC ${CMAKE_CURRENT_LIST_DIR}/../../../../../../)
Build helper – one invocation = one hardware tier +# 3. Build helper – one invocation = one hardware tier # -------------------------------------------------------------------------- include(ExternalProject) function(build_llama_tier tier march) - # ---------- 2.1 configure & build core code in an external project ----- + # ---------- 3.1 configure & build core code in an external project ----- set(build_dir ${CMAKE_BINARY_DIR}/llama_build_${tier}) # KleidiAI requires dotprod and i8mm @@ -71,7 +85,7 @@ function(build_llama_tier tier march) ${build_dir}/libggml-cpu.a ) - # ---------- 2.2 make the static libs produced above visible ------------ + # ---------- 3.2 make the static libs produced above visible ------------ set(llama_a ${build_dir}/libllama.a) set(common_a ${build_dir}/libcommon.a) set(ggml_a ${build_dir}/libggml.a) @@ -103,7 +117,7 @@ function(build_llama_tier tier march) IMPORTED_LOCATION ${ggml_cpu_a}) add_dependencies(ggml_cpu_core_${tier} llama_build_${tier}) - # ---------- 2.3 JNI wrapper DSO --------------------------------------- + # ---------- 3.3 JNI wrapper DSO --------------------------------------- add_library(llama_android_${tier} SHARED llama-android.cpp) target_compile_options(llama_android_${tier} PRIVATE "-march=${march}") @@ -124,13 +138,13 @@ function(build_llama_tier tier march) android log) - # ---------- 2.4 nice SONAME & filename ------------------------------- + # ---------- 3.4 nice SONAME & filename ------------------------------- set_target_properties(llama_android_${tier} PROPERTIES OUTPUT_NAME "llama_android_${tier}") endfunction() # -------------------------------------------------------------------------- -# 3. Build all five tiers +# 4. 
Build all five tiers # -------------------------------------------------------------------------- build_llama_tier(t0 "armv8-a+simd") build_llama_tier(t1 "armv8.2-a+dotprod") @@ -144,7 +158,7 @@ add_dependencies(llama_build_t3 llama_build_t2) #add_dependencies(llama_build_t4 llama_build_t3) # -------------------------------------------------------------------------- -# 4. Default variant when Gradle hasn’t told us (keeps IDE happy) +# 5. Default variant when Gradle hasn’t told us (keeps IDE happy) # -------------------------------------------------------------------------- if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) diff --git a/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp b/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp new file mode 100644 index 0000000000..d8be2a4f72 --- /dev/null +++ b/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +using namespace cpu_features; + +#define LOG_TAG "CpuDetector" +#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__) + +static const Aarch64Info info = GetAarch64Info(); +static const Aarch64Features features = info.features; + +extern "C" JNIEXPORT jint JNICALL +Java_android_llama_cpp_LLamaLibraryLoader_getOptimalTier( + JNIEnv* env, + jclass clazz) { + int tier = 0; // Default to T0 (baseline) + + // Check features in reverse order (highest tier first) + // TODO-han.yin: implement T4 once obtaining an Android device with SME! 
package android.llama.cpp

import android.content.Context
import android.content.SharedPreferences
import android.util.Log
import androidx.core.content.edit

/**
 * Hardware capability tiers, ordered lowest to highest. Each tier maps to a
 * separately-compiled native library (see CMakeLists.txt build_llama_tier).
 *
 * @property rawValue    integer returned by the native detector (cpu_detector.cpp)
 * @property libraryName base name of the tier's shared library
 * @property description human-readable summary of the required CPU features
 */
enum class LLamaTier(val rawValue: Int, val libraryName: String, val description: String) {
    T0(0, "llama_android_t0", "ARMv8-a baseline with SIMD"),
    T1(1, "llama_android_t1", "ARMv8.2-a with DotProd"),
    T2(2, "llama_android_t2", "ARMv8.6-a with DotProd + I8MM"),
    T3(3, "llama_android_t3", "ARMv9-a with DotProd + I8MM + SVE/SVE2");
    // TODO-han.yin: implement T4 once obtaining an Android device with SME!

    companion object {
        /** Maps a raw detector value to a tier, or null if unrecognized. */
        fun fromRawValue(value: Int): LLamaTier? {
            return entries.find { it.rawValue == value }
        }

        /**
         * Highest tier we actually ship a library for. Derived from the
         * declaration order so it stays correct when new tiers (e.g. T4/SME)
         * are added, instead of a hard-coded constant going stale.
         */
        fun getMaxSupportedTier(): LLamaTier = entries.last()
    }
}

/**
 * Detects the device's optimal hardware tier via the llama_cpu_detector
 * native library, caches the result in SharedPreferences, and hands out a
 * singleton LLamaAndroid instance backed by the matching tier library.
 *
 * Thread-safety: all mutation of the companion's cached state goes through
 * @Synchronized entry points (createInstance / clearCache).
 */
class LLamaLibraryLoader private constructor() {

    companion object {
        private val TAG = LLamaLibraryLoader::class.simpleName

        // Bump when the detection logic changes so stale cached tiers are discarded.
        private const val DETECTION_VERSION = 1
        private const val PREFS_NAME = "llama_cpu_detection"
        private const val KEY_DETECTED_TIER = "detected_tier"
        private const val KEY_DETECTION_VERSION = "detection_version"

        // Implemented in cpu_detector.cpp; requires libllama_cpu_detector to be loaded.
        @JvmStatic
        private external fun getOptimalTier(): Int

        @JvmStatic
        private external fun getCpuFeaturesString(): String

        private var _cachedInstance: LLamaAndroid? = null
        private var _detectedTier: LLamaTier? = null
        val detectedTier: LLamaTier? get() = _detectedTier

        /**
         * Factory method to get a configured LLamaAndroid instance.
         * Handles tier detection, caching, and library loading automatically.
         *
         * @return the shared instance, or null if detection/instantiation failed.
         */
        @Synchronized
        fun createInstance(context: Context): LLamaAndroid? {
            // Return cached instance if available
            _cachedInstance?.let { return it }

            try {
                // Obtain the optimal tier from cache if available
                val tier = getOrDetectOptimalTier(context) ?: run {
                    Log.e(TAG, "Failed to determine optimal tier")
                    return null
                }
                _detectedTier = tier
                Log.i(TAG, "Using tier: ${tier.name} (${tier.description})")

                // Create and cache LLamaAndroid instance
                val instance = LLamaAndroid.createWithTier(tier) ?: run {
                    Log.e(TAG, "Failed to instantiate LLamaAndroid")
                    return null
                }
                _cachedInstance = instance
                Log.i(TAG, "Successfully created LLamaAndroid instance with ${tier.name}")

                return instance

            } catch (e: Exception) {
                Log.e(TAG, "Error creating LLamaAndroid instance", e)
                return null
            }
        }

        /**
         * Clear cached detection results (for testing/debugging).
         * Synchronized: mutates the same companion state as createInstance.
         */
        @Synchronized
        fun clearCache(context: Context) {
            getSharedPrefs(context).edit { clear() }
            _cachedInstance = null
            _detectedTier = null
            Log.i(TAG, "Cleared detection results and cached instance")
        }

        /**
         * Get optimal tier from cache or detect it fresh.
         * Only called with the companion lock held (via createInstance).
         */
        private fun getOrDetectOptimalTier(context: Context): LLamaTier? {
            val prefs = getSharedPrefs(context)

            // Check if we have a cached result with the current detection version
            val cachedVersion = prefs.getInt(KEY_DETECTION_VERSION, -1)
            val cachedTierValue = prefs.getInt(KEY_DETECTED_TIER, -1)
            if (cachedVersion == DETECTION_VERSION && cachedTierValue >= 0) {
                val cachedTier = LLamaTier.fromRawValue(cachedTierValue)
                if (cachedTier != null) {
                    Log.i(TAG, "Using cached tier detection: ${cachedTier.name}")
                    return cachedTier
                }
            }

            // No valid cache, detect fresh
            Log.i(TAG, "Performing fresh tier detection")
            return detectAndCacheOptimalTier(context)
        }

        /**
         * Detect optimal tier via the native detector and save it to cache.
         * Falls back to (and caches) T0 if the detector library cannot load.
         */
        private fun detectAndCacheOptimalTier(context: Context): LLamaTier? {
            try {
                // Load CPU detection library
                System.loadLibrary("llama_cpu_detector")
                Log.i(TAG, "CPU feature detector loaded successfully")

                // Detect optimal tier
                val tierValue = getOptimalTier()
                val features = getCpuFeaturesString()
                Log.i(TAG, "Raw tier $tierValue w/ CPU features: $features")

                // Convert to enum and validate
                val tier = LLamaTier.fromRawValue(tierValue) ?: run {
                    Log.w(TAG, "Invalid tier value $tierValue")
                    return null
                }

                // Ensure we don't exceed maximum supported tier
                val finalTier = if (tier.rawValue > LLamaTier.getMaxSupportedTier().rawValue) {
                    Log.w(TAG, "Detected tier ${tier.name} exceeds max supported, using ${LLamaTier.getMaxSupportedTier().name}")
                    LLamaTier.getMaxSupportedTier()
                } else {
                    tier
                }

                // Cache the result
                getSharedPrefs(context).edit {
                    putInt(KEY_DETECTED_TIER, finalTier.rawValue)
                    putInt(KEY_DETECTION_VERSION, DETECTION_VERSION)
                }

                Log.i(TAG, "Detected and cached optimal tier: ${finalTier.name}")
                return finalTier

            } catch (e: UnsatisfiedLinkError) {
                Log.e(TAG, "Failed to load CPU detection library", e)

                // Fallback to T0 and cache it
                val fallbackTier = LLamaTier.T0
                getSharedPrefs(context).edit {
                    putInt(KEY_DETECTED_TIER, fallbackTier.rawValue)
                    putInt(KEY_DETECTION_VERSION, DETECTION_VERSION)
                }

                Log.i(TAG, "Using fallback tier: ${fallbackTier.name}")
                return fallbackTier

            } catch (e: Exception) {
                Log.e(TAG, "Unexpected error during tier detection", e)
                return null
            }
        }

        private fun getSharedPrefs(context: Context): SharedPreferences {
            return context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE)
        }
    }
}