Merge branch 'ggml-org:master' into power-law-sampler
This commit is contained in:
commit
2d67b1c008
|
|
@ -319,7 +319,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
|
|||
}
|
||||
}
|
||||
} else {
|
||||
jmsg["content"] = json(); // null
|
||||
jmsg["content"] = "";
|
||||
}
|
||||
if (!msg.reasoning_content.empty()) {
|
||||
jmsg["reasoning_content"] = msg.reasoning_content;
|
||||
|
|
|
|||
|
|
@ -1109,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) :
|
|||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// load and optionally apply lora adapters (must be loaded before context creation)
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
pimpl->model.reset(model);
|
||||
return;
|
||||
}
|
||||
|
||||
char buf[1024];
|
||||
la.ptr = lora.get();
|
||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
|
||||
la.task_name = buf;
|
||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
||||
la.prompt_prefix = buf;
|
||||
pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
}
|
||||
|
||||
// updates params.sampling
|
||||
// TODO: fix naming
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
|
|
@ -1245,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
|||
}
|
||||
}
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
return res;
|
||||
}
|
||||
|
||||
char buf[1024];
|
||||
la.ptr = lora.get();
|
||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
|
||||
la.task_name = buf;
|
||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
||||
la.prompt_prefix = buf;
|
||||
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
}
|
||||
|
||||
if (!params.lora_init_without_apply) {
|
||||
common_set_adapter_lora(lctx, params.lora_adapters);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,8 +5,11 @@ set -e
|
|||
MODEL_PATH="${1:-"$MODEL_PATH"}"
|
||||
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
|
||||
|
||||
CONVERTED_MODEL_PATH="${1:-"$CONVERTED_MODEL"}"
|
||||
CONVERTED_MODEL_NAME="${2:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}"
|
||||
|
||||
if [ -t 0 ]; then
|
||||
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
|
||||
CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
|
||||
else
|
||||
# Process piped JSON data and convert to binary (matching logits.cpp format)
|
||||
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
|
||||
|
|
|
|||
|
|
@ -401,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
|
|||
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
|
||||
ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
|
||||
elseif (APPLE)
|
||||
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|
||||
|
|
|
|||
|
|
@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.14.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.16.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
|
|
@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)
|
||||
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
|
||||
|
||||
|
|
@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
endif()
|
||||
|
||||
if (NOT SVE_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
|
||||
endif()
|
||||
|
||||
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
|
||||
list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@
|
|||
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
|
||||
|
||||
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
|
|
@ -69,9 +71,9 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
|
|||
|
||||
template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
|
||||
static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
|
||||
const void* lhs, const void* rhs, void* dst,
|
||||
size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max) {
|
||||
const void* lhs, const void* rhs, void* dst,
|
||||
size_t dst_stride_row, size_t dst_stride_col,
|
||||
float clamp_min, float clamp_max) {
|
||||
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
||||
}
|
||||
|
||||
|
|
@ -152,8 +154,8 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n
|
|||
|
||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
|
||||
static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
||||
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
|
||||
void* rhs_packed, size_t extra_bytes, const void* params) {
|
||||
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
|
||||
void* rhs_packed, size_t extra_bytes, const void* params) {
|
||||
Fn(num_groups, n, k, nr, kr, sr,
|
||||
static_cast<const int8_t*>(rhs),
|
||||
static_cast<const float*>(bias),
|
||||
|
|
@ -524,6 +526,61 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
|||
},
|
||||
#endif
|
||||
#else
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
{
|
||||
/* SVE i8mm GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||
},
|
||||
/* .gemm_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
|
||||
},
|
||||
/* SVE dotprod GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
|
||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||
},
|
||||
/* .gemv_lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
|
||||
/* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
/* .to_float = */ dequantize_row_qsi4c32pscalef16,
|
||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
{
|
||||
/* i8mm GEMM */
|
||||
|
|
@ -578,7 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
|||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#endif // __ARM_FEATURE_MATMUL_INT8
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
{
|
||||
/* DOTPROD GEMM */
|
||||
|
|
@ -811,26 +868,27 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
|
|||
ggml_kleidiai_kernels * kernel = nullptr;
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
|
||||
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
|
||||
if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
|
||||
gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
|
||||
gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
|
||||
gemm_gemv_kernels[i].op_type == tensor->type) {
|
||||
kernel = &gemm_gemv_kernels[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!kernel) {
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
|
||||
if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
|
||||
gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
|
||||
gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
|
||||
gemm_gemv_kernels_q8[i].op_type == tensor->type) {
|
||||
kernel = &gemm_gemv_kernels_q8[i];
|
||||
break;
|
||||
#if defined(__ARM_FEATURE_SME) || \
|
||||
defined(__ARM_FEATURE_DOTPROD) || \
|
||||
defined(__ARM_FEATURE_MATMUL_INT8) || \
|
||||
defined(__ARM_FEATURE_SVE)
|
||||
auto try_table = [&](auto & table) {
|
||||
for (size_t i = 0; i < NELEMS(table) - 1; ++i) {
|
||||
if ((cpu_features & table[i].required_cpu) == table[i].required_cpu &&
|
||||
table[i].lhs_type == tensor->src[1]->type &&
|
||||
table[i].rhs_type == tensor->src[0]->type &&
|
||||
table[i].op_type == tensor->type) {
|
||||
kernel = &table[i];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if (tensor->src[0]->type == GGML_TYPE_Q8_0) {
|
||||
try_table(gemm_gemv_kernels_q8);
|
||||
} else {
|
||||
try_table(gemm_gemv_kernels);
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(gemm_gemv_kernels);
|
||||
|
|
@ -845,7 +903,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
|
|||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
|
||||
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#if defined(__ARM_FEATURE_SME) || \
|
||||
defined(__ARM_FEATURE_DOTPROD) || \
|
||||
defined(__ARM_FEATURE_MATMUL_INT8) || \
|
||||
defined(__ARM_FEATURE_SVE)
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
|
||||
if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
|
||||
kernels = &gemm_gemv_kernels[i];
|
||||
|
|
|
|||
|
|
@ -46,13 +46,20 @@ struct ggml_kleidiai_context {
|
|||
} static ctx = { CPU_FEATURE_NONE, NULL, NULL };
|
||||
|
||||
static const char* cpu_feature_to_string(cpu_feature f) {
|
||||
switch (f) {
|
||||
case CPU_FEATURE_NONE: return "NONE";
|
||||
case CPU_FEATURE_DOTPROD: return "DOTPROD";
|
||||
case CPU_FEATURE_I8MM: return "I8MM";
|
||||
case CPU_FEATURE_SVE: return "SVE";
|
||||
case CPU_FEATURE_SME: return "SME";
|
||||
default: return "UNKNOWN";
|
||||
if (f == CPU_FEATURE_NONE) {
|
||||
return "NONE";
|
||||
} else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) {
|
||||
return "SME";
|
||||
} else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) {
|
||||
return "SVE";
|
||||
}
|
||||
else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) {
|
||||
return "I8MM";
|
||||
} else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) {
|
||||
return "DOTPROD";
|
||||
}
|
||||
else {
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +75,7 @@ static void init_kleidiai_context(void) {
|
|||
|
||||
ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
|
||||
if (env_var) {
|
||||
sme_enabled = atoi(env_var);
|
||||
|
|
|
|||
|
|
@ -4785,6 +4785,16 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
|
|||
features.push_back({ "FA_ALL_QUANTS", "1" });
|
||||
#endif
|
||||
|
||||
{
|
||||
const auto & info = ggml_cuda_info();
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
if (blackwell_mma_available(info.devices[id].cc)) {
|
||||
features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef _STRINGIFY
|
||||
#undef STRINGIFY
|
||||
|
||||
|
|
|
|||
|
|
@ -607,6 +607,8 @@ extern "C" {
|
|||
//
|
||||
|
||||
// Load a LoRA adapter from file
|
||||
// The adapter is valid as long as the associated model is not freed
|
||||
// All adapters must be loaded before context creation
|
||||
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
|
||||
struct llama_model * model,
|
||||
const char * path_lora);
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ Example function tool call syntax:
|
|||
{%- if message['role'] == 'user' -%}
|
||||
{{- '<|User|>' + message['content'] + '<|end▁of▁sentence|>' -}}
|
||||
{%- endif -%}
|
||||
{%- if message['role'] == 'assistant' and message['content'] is none -%}
|
||||
{%- if message['role'] == 'assistant' and not message['content'] -%}
|
||||
{{- '<|Assistant|><|tool▁calls▁begin|>' -}}
|
||||
{%- set ns.is_first = true -%}
|
||||
{%- for tc in message['tool_calls'] -%}
|
||||
|
|
@ -53,7 +53,7 @@ Example function tool call syntax:
|
|||
{%- endfor -%}
|
||||
{{- '<|tool▁calls▁end|><|end▁of▁sentence|>' -}}
|
||||
{%- endif -%}
|
||||
{%- if message['role'] == 'assistant' and message['content'] is not none -%}
|
||||
{%- if message['role'] == 'assistant' and message['content'] -%}
|
||||
{{- flush_tool_outputs() -}}
|
||||
{%- set content = message['content'] -%}
|
||||
{%- if '</think>' in content -%}
|
||||
|
|
@ -73,4 +73,4 @@ Example function tool call syntax:
|
|||
{{- flush_tool_outputs() -}}
|
||||
{%- if add_generation_prompt and not ns.is_tool_outputs -%}
|
||||
{{- '<|Assistant|><think>\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
|
|
|
|||
|
|
@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
|
||||
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
|
||||
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
||||
|
||||
llama_model & model = adapter.model;
|
||||
|
||||
ggml_context * ctx_init;
|
||||
gguf_init_params meta_gguf_params = {
|
||||
/* .no_alloc = */ true,
|
||||
|
|
@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
|||
}
|
||||
}
|
||||
|
||||
// update number of nodes used
|
||||
model.n_lora_nodes += adapter.get_n_nodes();
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
|
||||
}
|
||||
|
||||
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora();
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora(*model);
|
||||
|
||||
try {
|
||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||
llama_adapter_lora_init_impl(path_lora, *adapter);
|
||||
return adapter;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
|
|
@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
|||
}
|
||||
|
||||
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
|
||||
// update number of nodes used
|
||||
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
|
||||
adapter->model.n_lora_nodes -= adapter->get_n_nodes();
|
||||
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
|
|||
};
|
||||
|
||||
struct llama_adapter_lora {
|
||||
llama_model & model;
|
||||
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
||||
|
||||
|
|
@ -73,10 +75,14 @@ struct llama_adapter_lora {
|
|||
// activated lora (aLoRA)
|
||||
std::vector<llama_token> alora_invocation_tokens;
|
||||
|
||||
llama_adapter_lora() = default;
|
||||
llama_adapter_lora(llama_model & model) : model(model) {}
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|
||||
|
||||
uint32_t get_n_nodes() const {
|
||||
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
|
||||
}
|
||||
};
|
||||
|
||||
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
|
||||
|
|
|
|||
|
|
@ -1442,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
|||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
res += model.n_lora_nodes;
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||
|
|
|
|||
|
|
@ -305,7 +305,7 @@ public:
|
|||
bool do_shift,
|
||||
stream_copy_info sc_info);
|
||||
|
||||
// used to create a batch procesing context from a batch
|
||||
// used to create a batch processing context from a batch
|
||||
llama_kv_cache_context(
|
||||
llama_kv_cache * kv,
|
||||
slot_info_vec_t sinfos,
|
||||
|
|
|
|||
|
|
@ -475,6 +475,9 @@ struct llama_model {
|
|||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
// for keeping track of extra nodes used by lora adapters
|
||||
uint32_t n_lora_nodes = 0;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -421,39 +421,6 @@ void llama_sampler_free(struct llama_sampler * smpl) {
|
|||
delete smpl;
|
||||
}
|
||||
|
||||
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// TODO: do not allocate each time
|
||||
std::vector<llama_token_data> cur;
|
||||
cur.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
/* .data = */ cur.data(),
|
||||
/* .size = */ cur.size(),
|
||||
/* .selected = */ -1,
|
||||
/* .sorted = */ false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(smpl, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
|
||||
|
||||
auto token = cur_p.data[cur_p.selected].id;
|
||||
|
||||
llama_sampler_accept(smpl, token);
|
||||
|
||||
return token;
|
||||
}
|
||||
|
||||
// sampler chain
|
||||
|
||||
static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
|
||||
|
|
@ -527,12 +494,56 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
|
|||
/* .ctx = */ new llama_sampler_chain {
|
||||
/* .params = */ params,
|
||||
/* .samplers = */ {},
|
||||
/* .cur = */ {},
|
||||
/* .t_sample_us = */ 0,
|
||||
/* .n_sample = */ 0,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
// use pre-allocated buffer from chain if available, otherwise allocate locally
|
||||
std::vector<llama_token_data> * cur_ptr;
|
||||
std::vector<llama_token_data> cur_local;
|
||||
|
||||
if (smpl->iface == &llama_sampler_chain_i) {
|
||||
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
||||
cur_ptr = &chain->cur;
|
||||
} else {
|
||||
cur_ptr = &cur_local;
|
||||
}
|
||||
|
||||
auto & cur = *cur_ptr;
|
||||
cur.resize(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
/* .data = */ cur.data(),
|
||||
/* .size = */ cur.size(),
|
||||
/* .selected = */ -1,
|
||||
/* .sorted = */ false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(smpl, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
|
||||
|
||||
auto token = cur_p.data[cur_p.selected].id;
|
||||
|
||||
llama_sampler_accept(smpl, token);
|
||||
|
||||
return token;
|
||||
}
|
||||
|
||||
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
|
||||
auto * p = (llama_sampler_chain *) chain->ctx;
|
||||
p->samplers.push_back(smpl);
|
||||
|
|
|
|||
|
|
@ -16,6 +16,9 @@ struct llama_sampler_chain {
|
|||
|
||||
std::vector<struct llama_sampler *> samplers;
|
||||
|
||||
// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
|
||||
std::vector<llama_token_data> cur;
|
||||
|
||||
// timing
|
||||
|
||||
mutable int64_t t_sample_us;
|
||||
|
|
|
|||
|
|
@ -650,7 +650,7 @@ static void test_msgs_oaicompat_json_conversion() {
|
|||
"[\n"
|
||||
" {\n"
|
||||
" \"role\": \"assistant\",\n"
|
||||
" \"content\": null,\n"
|
||||
" \"content\": \"\",\n"
|
||||
" \"tool_calls\": [\n"
|
||||
" {\n"
|
||||
" \"type\": \"function\",\n"
|
||||
|
|
@ -906,7 +906,8 @@ static void test_template_output_parsers() {
|
|||
" },\n"
|
||||
" \"id\": \"123456789\"\n"
|
||||
" }\n"
|
||||
" ]\n"
|
||||
" ],\n"
|
||||
" \"content\": \"\"\n"
|
||||
"}");
|
||||
}
|
||||
{
|
||||
|
|
@ -1713,7 +1714,8 @@ static void test_template_output_parsers() {
|
|||
" },\n"
|
||||
" \"id\": \"123456789\"\n"
|
||||
" }\n"
|
||||
" ]\n"
|
||||
" ],\n"
|
||||
" \"content\": \"\"\n"
|
||||
"}",
|
||||
/* expect_grammar_triggered= */ false
|
||||
);
|
||||
|
|
|
|||
|
|
@ -38,14 +38,6 @@ set(TARGET_SRCS
|
|||
server-http.h
|
||||
server-models.cpp
|
||||
server-models.h
|
||||
server-task.cpp
|
||||
server-task.h
|
||||
server-queue.cpp
|
||||
server-queue.h
|
||||
server-common.cpp
|
||||
server-common.h
|
||||
server-context.cpp
|
||||
server-context.h
|
||||
)
|
||||
set(PUBLIC_ASSETS
|
||||
index.html.gz
|
||||
|
|
|
|||
Loading…
Reference in New Issue