# --------------------------------------------------------------------------
# 1. CPU feature detection library
# --------------------------------------------------------------------------
# Pull in Google's cpu_features sources from the repository's include/ tree.
# (Out-of-tree binary dir keeps the detector build separate from the tiers.)
add_subdirectory(
    ${CMAKE_CURRENT_LIST_DIR}/../../../../../../include/cpu_features
    ${CMAKE_BINARY_DIR}/cpu_features_build)

# CPU feature detection library (lightweight, loads first): queried by
# LLamaLibraryLoader to choose a hardware tier before any of the heavy
# llama_android_t* libraries are loaded.
add_library(llama_cpu_detector SHARED cpu_detector.cpp)
target_link_libraries(llama_cpu_detector
    PRIVATE CpuFeatures::cpu_features
            android
            log)

# --------------------------------------------------------------------------
# 2. Locate the root of the llama.cpp source tree
#    (six levels up from this CMakeLists.txt).
# --------------------------------------------------------------------------
set(LLAMA_SRC ${CMAKE_CURRENT_LIST_DIR}/../../../../../../)
Build helper – one invocation = one hardware tier +# 3. Build helper – one invocation = one hardware tier # -------------------------------------------------------------------------- include(ExternalProject) function(build_llama_tier tier march) - # ---------- 2.1 configure & build core code in an external project ----- + # ---------- 3.1 configure & build core code in an external project ----- set(build_dir ${CMAKE_BINARY_DIR}/llama_build_${tier}) # KleidiAI requires dotprod and i8mm @@ -71,7 +85,7 @@ function(build_llama_tier tier march) ${build_dir}/libggml-cpu.a ) - # ---------- 2.2 make the static libs produced above visible ------------ + # ---------- 3.2 make the static libs produced above visible ------------ set(llama_a ${build_dir}/libllama.a) set(common_a ${build_dir}/libcommon.a) set(ggml_a ${build_dir}/libggml.a) @@ -103,7 +117,7 @@ function(build_llama_tier tier march) IMPORTED_LOCATION ${ggml_cpu_a}) add_dependencies(ggml_cpu_core_${tier} llama_build_${tier}) - # ---------- 2.3 JNI wrapper DSO --------------------------------------- + # ---------- 3.3 JNI wrapper DSO --------------------------------------- add_library(llama_android_${tier} SHARED llama-android.cpp) target_compile_options(llama_android_${tier} PRIVATE "-march=${march}") @@ -124,13 +138,13 @@ function(build_llama_tier tier march) android log) - # ---------- 2.4 nice SONAME & filename ------------------------------- + # ---------- 3.4 nice SONAME & filename ------------------------------- set_target_properties(llama_android_${tier} PROPERTIES OUTPUT_NAME "llama_android_${tier}") endfunction() # -------------------------------------------------------------------------- -# 3. Build all five tiers +# 4. 
Build all five tiers # -------------------------------------------------------------------------- build_llama_tier(t0 "armv8-a+simd") build_llama_tier(t1 "armv8.2-a+dotprod") @@ -144,7 +158,7 @@ add_dependencies(llama_build_t3 llama_build_t2) #add_dependencies(llama_build_t4 llama_build_t3) # -------------------------------------------------------------------------- -# 4. Default variant when Gradle hasn’t told us (keeps IDE happy) +# 5. Default variant when Gradle hasn’t told us (keeps IDE happy) # -------------------------------------------------------------------------- if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) diff --git a/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp b/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp new file mode 100644 index 0000000000..d8be2a4f72 --- /dev/null +++ b/examples/llama.android/llama/src/main/cpp/cpu_detector.cpp @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +using namespace cpu_features; + +#define LOG_TAG "CpuDetector" +#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__) + +static const Aarch64Info info = GetAarch64Info(); +static const Aarch64Features features = info.features; + +extern "C" JNIEXPORT jint JNICALL +Java_android_llama_cpp_LLamaLibraryLoader_getOptimalTier( + JNIEnv* env, + jclass clazz) { + int tier = 0; // Default to T0 (baseline) + + // Check features in reverse order (highest tier first) + // TODO-han.yin: implement T4 once obtaining an Android device with SME! 
package android.llama.cpp

import android.content.Context
import android.content.SharedPreferences
import android.util.Log
import androidx.core.content.edit

/**
 * Hardware capability tiers, ordered lowest to highest. Each tier maps to a
 * separately-compiled native library (see CMakeLists.txt build_llama_tier).
 *
 * @property rawValue    integer returned by the native detector (cpu_detector.cpp)
 * @property libraryName base name of the tier's shared library
 * @property description human-readable summary of the required CPU features
 */
enum class LLamaTier(val rawValue: Int, val libraryName: String, val description: String) {
    T0(0, "llama_android_t0", "ARMv8-a baseline with SIMD"),
    T1(1, "llama_android_t1", "ARMv8.2-a with DotProd"),
    T2(2, "llama_android_t2", "ARMv8.6-a with DotProd + I8MM"),
    T3(3, "llama_android_t3", "ARMv9-a with DotProd + I8MM + SVE/SVE2");
    // TODO-han.yin: implement T4 once obtaining an Android device with SME!

    companion object {
        /** Maps a raw detector value to a tier, or null if unrecognized. */
        fun fromRawValue(value: Int): LLamaTier? {
            return entries.find { it.rawValue == value }
        }

        /**
         * Highest tier we actually ship a library for. Derived from the
         * declaration order so it stays correct when new tiers (e.g. T4/SME)
         * are added, instead of a hard-coded constant going stale.
         */
        fun getMaxSupportedTier(): LLamaTier = entries.last()
    }
}

/**
 * Detects the device's optimal hardware tier via the llama_cpu_detector
 * native library, caches the result in SharedPreferences, and hands out a
 * singleton LLamaAndroid instance backed by the matching tier library.
 *
 * Thread-safety: all mutation of the companion's cached state goes through
 * @Synchronized entry points (createInstance / clearCache).
 */
class LLamaLibraryLoader private constructor() {

    companion object {
        private val TAG = LLamaLibraryLoader::class.simpleName

        // Bump when the detection logic changes so stale cached tiers are discarded.
        private const val DETECTION_VERSION = 1
        private const val PREFS_NAME = "llama_cpu_detection"
        private const val KEY_DETECTED_TIER = "detected_tier"
        private const val KEY_DETECTION_VERSION = "detection_version"

        // Implemented in cpu_detector.cpp; requires libllama_cpu_detector to be loaded.
        @JvmStatic
        private external fun getOptimalTier(): Int

        @JvmStatic
        private external fun getCpuFeaturesString(): String

        private var _cachedInstance: LLamaAndroid? = null
        private var _detectedTier: LLamaTier? = null
        val detectedTier: LLamaTier? get() = _detectedTier

        /**
         * Factory method to get a configured LLamaAndroid instance.
         * Handles tier detection, caching, and library loading automatically.
         *
         * @return the shared instance, or null if detection/instantiation failed.
         */
        @Synchronized
        fun createInstance(context: Context): LLamaAndroid? {
            // Return cached instance if available
            _cachedInstance?.let { return it }

            try {
                // Obtain the optimal tier from cache if available
                val tier = getOrDetectOptimalTier(context) ?: run {
                    Log.e(TAG, "Failed to determine optimal tier")
                    return null
                }
                _detectedTier = tier
                Log.i(TAG, "Using tier: ${tier.name} (${tier.description})")

                // Create and cache LLamaAndroid instance
                val instance = LLamaAndroid.createWithTier(tier) ?: run {
                    Log.e(TAG, "Failed to instantiate LLamaAndroid")
                    return null
                }
                _cachedInstance = instance
                Log.i(TAG, "Successfully created LLamaAndroid instance with ${tier.name}")

                return instance

            } catch (e: Exception) {
                Log.e(TAG, "Error creating LLamaAndroid instance", e)
                return null
            }
        }

        /**
         * Clear cached detection results (for testing/debugging).
         * Synchronized: mutates the same companion state as createInstance.
         */
        @Synchronized
        fun clearCache(context: Context) {
            getSharedPrefs(context).edit { clear() }
            _cachedInstance = null
            _detectedTier = null
            Log.i(TAG, "Cleared detection results and cached instance")
        }

        /**
         * Get optimal tier from cache or detect it fresh.
         * Only called with the companion lock held (via createInstance).
         */
        private fun getOrDetectOptimalTier(context: Context): LLamaTier? {
            val prefs = getSharedPrefs(context)

            // Check if we have a cached result with the current detection version
            val cachedVersion = prefs.getInt(KEY_DETECTION_VERSION, -1)
            val cachedTierValue = prefs.getInt(KEY_DETECTED_TIER, -1)
            if (cachedVersion == DETECTION_VERSION && cachedTierValue >= 0) {
                val cachedTier = LLamaTier.fromRawValue(cachedTierValue)
                if (cachedTier != null) {
                    Log.i(TAG, "Using cached tier detection: ${cachedTier.name}")
                    return cachedTier
                }
            }

            // No valid cache, detect fresh
            Log.i(TAG, "Performing fresh tier detection")
            return detectAndCacheOptimalTier(context)
        }

        /**
         * Detect optimal tier via the native detector and save it to cache.
         * Falls back to (and caches) T0 if the detector library cannot load.
         */
        private fun detectAndCacheOptimalTier(context: Context): LLamaTier? {
            try {
                // Load CPU detection library
                System.loadLibrary("llama_cpu_detector")
                Log.i(TAG, "CPU feature detector loaded successfully")

                // Detect optimal tier
                val tierValue = getOptimalTier()
                val features = getCpuFeaturesString()
                Log.i(TAG, "Raw tier $tierValue w/ CPU features: $features")

                // Convert to enum and validate
                val tier = LLamaTier.fromRawValue(tierValue) ?: run {
                    Log.w(TAG, "Invalid tier value $tierValue")
                    return null
                }

                // Ensure we don't exceed maximum supported tier
                val finalTier = if (tier.rawValue > LLamaTier.getMaxSupportedTier().rawValue) {
                    Log.w(TAG, "Detected tier ${tier.name} exceeds max supported, using ${LLamaTier.getMaxSupportedTier().name}")
                    LLamaTier.getMaxSupportedTier()
                } else {
                    tier
                }

                // Cache the result
                getSharedPrefs(context).edit {
                    putInt(KEY_DETECTED_TIER, finalTier.rawValue)
                    putInt(KEY_DETECTION_VERSION, DETECTION_VERSION)
                }

                Log.i(TAG, "Detected and cached optimal tier: ${finalTier.name}")
                return finalTier

            } catch (e: UnsatisfiedLinkError) {
                Log.e(TAG, "Failed to load CPU detection library", e)

                // Fallback to T0 and cache it
                val fallbackTier = LLamaTier.T0
                getSharedPrefs(context).edit {
                    putInt(KEY_DETECTED_TIER, fallbackTier.rawValue)
                    putInt(KEY_DETECTION_VERSION, DETECTION_VERSION)
                }

                Log.i(TAG, "Using fallback tier: ${fallbackTier.name}")
                return fallbackTier

            } catch (e: Exception) {
                Log.e(TAG, "Unexpected error during tier detection", e)
                return null
            }
        }

        private fun getSharedPrefs(context: Context): SharedPreferences {
            return context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE)
        }
    }
}