Merge remote-tracking branch 'upstream/master' into backend-sampling

Daniel Bevenius 2025-11-25 06:10:33 +01:00
commit 2b4c7927ee
35 changed files with 4384 additions and 3015 deletions

View File

@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
const auto sampler_names = string_split<std::string>(value, ';');
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
}
).set_sparam());
add_opt(common_arg(
@@ -1261,6 +1262,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
}
).set_sparam());
add_opt(common_arg(
@@ -1268,6 +1270,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
[](common_params & params, int value) {
params.sampling.top_k = value;
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
}
).set_sparam());
add_opt(common_arg(
@@ -1275,6 +1278,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
[](common_params & params, const std::string & value) {
params.sampling.top_p = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1282,6 +1286,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
[](common_params & params, const std::string & value) {
params.sampling.min_p = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1296,6 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
[](common_params & params, const std::string & value) {
params.sampling.xtc_probability = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
}
).set_sparam());
add_opt(common_arg(
@@ -1303,6 +1309,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sampling.xtc_threshold = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
}
).set_sparam());
add_opt(common_arg(
@@ -1321,6 +1328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
}
).set_sparam());
add_opt(common_arg(
@@ -1328,6 +1336,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
[](common_params & params, const std::string & value) {
params.sampling.penalty_repeat = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1425,6 +1434,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
[](common_params & params, int value) {
params.sampling.mirostat = value;
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1432,6 +1442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_eta = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
}
).set_sparam());
add_opt(common_arg(
@@ -1439,6 +1450,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_tau = std::stof(value);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
}
).set_sparam());
add_opt(common_arg(

View File

@@ -950,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
// Model utils
//
static inline void common_init_sampler_from_model(
const llama_model * model,
common_params_sampling & sparams) {
const uint64_t config = sparams.user_sampling_config;
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
if (config & user_config) return;
char buf[64] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
int32_t v = strtol(buf, &end, 10);
if (end && end != buf) dst = v;
}
};
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
if (config & user_config) return;
char buf[128] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
float v = strtof(buf, &end);
if (end && end != buf) dst = v;
}
};
// Sampling sequence
if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
char buf[512] = {0};
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
if (!sampler_names.empty()) {
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
}
}
}
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
@@ -961,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
common_init_sampler_from_model(model, params.sampling);
const llama_vocab * vocab = llama_model_get_vocab(model);
auto cparams = common_context_params_to_llama(params);
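Net effect of the two hunks above: a sampling value stored in the model's GGUF metadata is applied only if the matching bit in `user_sampling_config` was never set by a command-line flag, so explicit CLI arguments always win over model-recommended defaults. A minimal self-contained sketch of that gating rule (hypothetical names and values, not the llama.cpp API):

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>

// Hypothetical stand-ins for the real enum/struct; only the gating logic is illustrated.
enum sampling_config_bit : uint64_t {
    CONFIG_TEMP  = 1 << 0,
    CONFIG_TOP_K = 1 << 1,
};

struct sampling_params {
    float    temp  = 0.8f; // built-in default
    int32_t  top_k = 40;   // built-in default
    uint64_t user_sampling_config = 0;
};

// Apply a model-recommended value only if the user did not set the parameter explicitly.
template <typename T>
static void apply_model_default(T & dst, std::optional<T> model_value, uint64_t cfg, uint64_t bit) {
    if ((cfg & bit) == 0 && model_value) {
        dst = *model_value;
    }
}

int main() {
    sampling_params p;
    // Pretend the user passed --top-k 100 on the command line:
    p.top_k = 100;
    p.user_sampling_config |= CONFIG_TOP_K;

    // Pretend the GGUF metadata recommends temp = 0.6 and top_k = 20:
    apply_model_default(p.temp,  std::optional<float>(0.6f), p.user_sampling_config, CONFIG_TEMP);
    apply_model_default(p.top_k, std::optional<int32_t>(20), p.user_sampling_config, CONFIG_TOP_K);

    std::printf("temp = %.2f, top_k = %d\n", p.temp, p.top_k); // temp = 0.60, top_k = 100
    return 0;
}
```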

View File

@@ -140,6 +140,22 @@ struct common_grammar_trigger {
llama_token token = LLAMA_TOKEN_NULL;
};
enum common_params_sampling_config : uint64_t {
COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,6 +188,8 @@ struct common_params_sampling {
bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

View File

@@ -565,7 +565,7 @@ class ModelBase:
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
)
)
- or not new_name.endswith(".weight")
+ or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
):
data_qtype = gguf.GGMLQuantizationType.F32

View File

@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
- "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+ "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
parser.add_argument( parser.add_argument(

View File

@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
- ./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+ ./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
...

View File

@@ -25,16 +25,17 @@ if(GIT_EXE)
)
endif()
- # Build the version string with optional dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}")
- if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
- set(GGML_VERSION "${GGML_VERSION}-dirty")
- endif()
if(NOT GGML_BUILD_COMMIT)
set(GGML_BUILD_COMMIT "unknown")
endif()
# Build the commit string with optional dirty flag
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
endif()
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

View File

@@ -328,6 +328,14 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_INTERNAL_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
foreach (feat RVV)
set(GGML_INTERNAL_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
@@ -402,6 +410,13 @@ if (GGML_CPU_ALL_VARIANTS)
else()
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
ggml_add_cpu_backend_variant(riscv64_0)
ggml_add_cpu_backend_variant(riscv64_v RVV)
else()
message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
endif()
else()
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
endif()

View File

@@ -452,6 +452,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/spacemit/ime_kernels.h
)
endif()
if(NOT GGML_CPU_ALL_VARIANTS)
set(MARCH_STR "rv64gc")
if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh")
@@ -468,6 +469,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(APPEND MARCH_STR "_zicbop")
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
else()
# Begin with the lowest baseline
set(ARCH_DEFINITIONS "")
if (GGML_INTERNAL_RVV)
message(STATUS "RVV enabled")
list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
endif()
ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES

View File

@@ -51,10 +51,8 @@
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
- #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
- #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)

View File

@@ -24,6 +24,29 @@
#define UNUSED GGML_UNUSED
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
int16x8_t * out_mins,
int8_t * out_scales) {
constexpr uint32_t kmask1 = 0x3f3f3f3f;
constexpr uint32_t kmask2 = 0x0f0f0f0f;
constexpr uint32_t kmask3 = 0x03030303;
constexpr uint8_t scales_size = 12;
uint32_t sm[3];
memcpy(sm, scales_in, scales_size);
const uint32_t mins_0_3 = sm[1] & kmask1;
const uint32_t mins_4_7 = ((sm[2] >> 4) & kmask2) | (((sm[1] >> 6) & kmask3) << 4);
const uint32x2_t mins_u32 = { mins_0_3, mins_4_7 };
*out_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins_u32)));
uint32_t scales_u32[2];
scales_u32[0] = sm[0] & kmask1;
scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
memcpy(out_scales, scales_u32, 8);
}
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
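The 12-byte packed 6-bit scales/mins layout that `decode_q4_Kx8_scales_mins` unpacks with NEON above can also be written as plain scalar code. The following sketch is not part of the patch; it only mirrors the same masks and byte layout for reference:

```cpp
#include <cstdint>
#include <cstring>

// Scalar sketch of the unpacking done by decode_q4_Kx8_scales_mins:
// 12 packed bytes -> 8 six-bit scales and 8 six-bit mins (same masks as the NEON path).
static void decode_q4_scales_mins_scalar(const uint8_t * scales_in, int16_t out_mins[8], int8_t out_scales[8]) {
    const uint32_t kmask1 = 0x3f3f3f3f;
    const uint32_t kmask2 = 0x0f0f0f0f;
    const uint32_t kmask3 = 0x03030303;

    uint32_t sm[3];
    std::memcpy(sm, scales_in, 12);

    // mins 0..3 are the low 6 bits of bytes 4..7; mins 4..7 combine the high nibbles of
    // bytes 8..11 with the top 2 bits of bytes 4..7.
    const uint32_t mins_0_3 = sm[1] & kmask1;
    const uint32_t mins_4_7 = ((sm[2] >> 4) & kmask2) | (((sm[1] >> 6) & kmask3) << 4);
    uint8_t mins8[8];
    std::memcpy(mins8 + 0, &mins_0_3, 4);
    std::memcpy(mins8 + 4, &mins_4_7, 4);
    for (int i = 0; i < 8; i++) {
        out_mins[i] = (int16_t) mins8[i]; // widened to 16 bits, as vmovl_u8 does in the NEON code
    }

    // scales use the same trick on bytes 0..3 and 8..11.
    uint32_t scales_u32[2];
    scales_u32[0] = sm[0] & kmask1;
    scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
    std::memcpy(out_scales, scales_u32, 8);
}
```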
@@ -474,6 +497,162 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON)
constexpr int col_pairs = ncols_interleaved / 2;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[ncols_interleaved / 4];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < ncols_interleaved / 4; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0 = vmulq_f32(q4_d_0, q8_d);
float32x4_t sb_scale_1 = vmulq_f32(q4_d_1, q8_d);
float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
float32x4_t sb_min_0 = vmulq_f32(q4_dmin_0, q8_d);
float32x4_t sb_min_1 = vmulq_f32(q4_dmin_1, q8_d);
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
// 2 sb each iteration
int32x4_t acc_lo[col_pairs];
int32x4_t acc_hi[col_pairs];
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
int16_t bsums_arr[8];
vst1q_s16(bsums_arr, bsums);
for (int sb = 0; sb < QK_K / 64; sb++) {
for (int i = 0; i < col_pairs; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
int16x8_t q4sb_scales[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q4sb[8];
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
}
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
// Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
// but still need the qs to use the low and hi bits from q4
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
int8x16_t q8_qs[8];
for (int i = 0; i < 8; i++) {
q8_qs[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base + i * 8));
}
// Q4s columns iterated in pairs (01, 23, 45, 67)
for (int cp = 0; cp < col_pairs; cp++) {
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_base + 16 * cp);
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_base + 16 * cp + 64);
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_base + 16 * cp + 128);
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_base + 16 * cp + 192);
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)), q8_qs[0]); // 0 .. 7
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)), q8_qs[1]); // 8 ..15
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)), q8_qs[2]); // 16..23
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)), q8_qs[3]); // 24..31
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)), q8_qs[4]); // 32..39
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)), q8_qs[5]); // 40..47
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)), q8_qs[6]); // 48..55
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)), q8_qs[7]); // 56..63
}
// Iterates over a pair of column pairs (4 columns) to use a single 128 register
// p = 0 -> 0123 p2 -> 4567
for (int i = 0, p = 0; p < col_pairs; i++, p += 2) {
int16x4_t group_scales_lo = p == 0 ? vget_low_s16(q4sb_scales[0]) : vget_high_s16(q4sb_scales[0]);
int16x4_t group_scales_hi = p == 0 ? vget_low_s16(q4sb_scales[1]) : vget_high_s16(q4sb_scales[1]);
float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
// 0123 or 4567
// TODO: Single superblock mul at the end of the superblock
float32x4_t sumf_0 =
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
float32x4_t sumf_1 =
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_hi), vpaddq_s32(acc_hi[p], acc_hi[p + 1])));
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_1);
}
// Multiply Acc bsum + mins
// Each pair of subblocks share the same bsums
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
// cols 0-3 bias
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
// cols 4-7 bias
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
} // for sb
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0);
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_1);
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -1889,3 +2068,212 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
constexpr int q8_k_blocklen = 4;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 8 accumulators: 2 row pairs × 4 col pairs
float32x4_t acc_f32[blocklen];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < blocklen; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
// bsums pairs belongs to the same q8_k subblock
const int16x8_t bsums[4]{
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
};
int16_t bsums_arr[4][8];
for (int q8_row = 0; q8_row < 4; q8_row++) {
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
}
int32x4_t sb_acc[4]; // Aux accumulators to store subblock (partial) results
int32x4_t acc[8]; // rows 01 stored in [0][1][2][3] rows 23 stored in [4][5][6][7]
int32x4_t bias_acc[8]; // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567, [2]->r1 0123 ...
for (int i = 0; i < 8; i++) {
acc[i] = vdupq_n_s32(0);
bias_acc[i] = vdupq_n_s32(0);
}
for (int sb = 0; sb < QK_K / 64; sb++) {
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int8_t q4sb_scales[2][8];
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
for (int i = 0; i < 2; i++) {
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], q4sb_scales[i]);
}
// q8_ptr[b].qs has interleaved Q8 rows (01, 23)
const int8_t * q8_base = q8_ptr[b].qs + sb * 256;
int8x16_t q8_qs_01[8];
int8x16_t q8_qs_23[8];
// Load 32-byte per row pair, 1 subblock each time
for (int i = 0; i < 8; i++) {
const int offset = i * 32; // 16 for row 01, 16 for row 23
q8_qs_01[i] = vld1q_s8(q8_base + offset);
q8_qs_23[i] = vld1q_s8(q8_base + offset + 16);
}
const int8x16_t q8s[2][8] = {
{ q8_qs_01[0], q8_qs_01[1], q8_qs_01[2], q8_qs_01[3],
q8_qs_01[4], q8_qs_01[5], q8_qs_01[6], q8_qs_01[7] },
{ q8_qs_23[0], q8_qs_23[1], q8_qs_23[2], q8_qs_23[3],
q8_qs_23[4], q8_qs_23[5], q8_qs_23[6], q8_qs_23[7] },
};
// Q4s columns iterated in pairs (01, 23, 45, 67)
for (int cp = 0; cp < ncols_interleaved / 2; cp++) {
for (int i = 0; i < 4; i++) {
sb_acc[i] = vdupq_n_s32(0);
}
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 0); // 0 .. 7 & 32..39
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 64); // 8 ..15 & 40..47
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 128); // 16..23 & 48..55
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 192); // 24..31 & 56..63
const int8x16_t q4_nibbles[2][4] = {
{
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)),
},
{
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)),
}
};
// Calculates the Qs muladd of every row pair (rp) rows 01 and 23 of q8
// for each of the internal 32 qs subblock (blk)
for (int rp = 0; rp < 2; rp++) {
for (int blk = 0; blk < 2; blk++) {
const int8x16_t * q8 = &q8s[rp][4 * blk];
const int8x16_t * q4 = q4_nibbles[blk];
int32x4_t acc = sb_acc[2 * rp + blk];
// mul add for each qs in the same subblock
for (int qs_offset = 0; qs_offset < 4; qs_offset++) {
acc = vmmlaq_s32(acc, q4[qs_offset], q8[qs_offset]);
}
sb_acc[2 * rp + blk] = acc;
}
}
// Scales[i] corresponds to column i
const int scale_offset = cp * 2;
for (int blk = 0; blk < 2; blk++) {
const int32x4_t block_scale = {
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset + 1],
(int32_t) q4sb_scales[blk][scale_offset + 1],
};
acc[cp] = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
}
}
// Multiply Acc bsum + mins
for (int q8_row = 0; q8_row < 4; q8_row++) {
// Each pair of subblocks share the same bsums
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][q8_row * 2]);
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][q8_row * 2 + 1]);
bias_acc[2 * q8_row] =
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[2 * q8_row] =
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
bias_acc[2 * q8_row + 1] =
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[2 * q8_row + 1] =
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
}
} // for sb
// Reorder of i8mm output with bias and output layout
for (int i = 0; i < 8; i++) {
int32x2x2_t aux = vzip_s32(vget_low_s32(acc[i]), vget_high_s32(acc[i]));
acc[i] = vcombine_s32(aux.val[0], aux.val[1]);
}
int32x4_t reorder_acc[8] = {
vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1])),
vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3])),
vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1])),
vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3])),
vcombine_s32(vget_low_s32(acc[4]), vget_low_s32(acc[5])),
vcombine_s32(vget_low_s32(acc[6]), vget_low_s32(acc[7])),
vcombine_s32(vget_high_s32(acc[4]), vget_high_s32(acc[5])),
vcombine_s32(vget_high_s32(acc[6]), vget_high_s32(acc[7])),
};
for (int i = 0; i < q8_k_blocklen; i++) {
for (int j = 0; j < 2; j++) {
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d[i]);
float32x4_t q4_dmin = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].dmin + j * 4)));
const float32x4_t dmins = vmulq_f32(q4_dmin, q8_d);
float32x4_t q4_d = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].d + j * 4)));
const float32x4_t scale = vmulq_f32(q4_d, q8_d);
acc_f32[2 * i + j] = vmlsq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(bias_acc[2 * i + j]), dmins);
acc_f32[2 * i + j] =
vmlaq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(reorder_acc[2 * i + j]), scale);
}
}
} // for b
// With the previous reorder, the tile is already in the correct memory layout.
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
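The reorder step above is needed because each `vmmlaq_s32` call produces a 2x2 int32 tile rather than row-major rows. A standalone sketch of what a single call computes, assuming an AArch64 toolchain with the i8mm extension (illustrative, not part of the patch):

```cpp
// Compile with e.g. -march=armv8.2-a+i8mm on an AArch64 toolchain.
#include <arm_neon.h>
#include <cstdio>

int main() {
#if defined(__ARM_FEATURE_MATMUL_INT8)
    // A holds two rows of 8 int8 values, B holds two rows of 8 int8 values.
    // vmmlaq_s32 accumulates C(2x2) += A(2x8) * B(2x8)^T, so the four int32 lanes are
    //   [ a0.b0, a0.b1, a1.b0, a1.b1 ]
    const int8_t a_bytes[16] = { 1,1,1,1,1,1,1,1,  2,2,2,2,2,2,2,2 };
    const int8_t b_bytes[16] = { 1,2,3,4,5,6,7,8,  1,1,1,1,1,1,1,1 };

    const int8x16_t A = vld1q_s8(a_bytes);
    const int8x16_t B = vld1q_s8(b_bytes);
    int32x4_t C = vdupq_n_s32(0);

    C = vmmlaq_s32(C, A, B);

    int32_t c[4];
    vst1q_s32(c, C);
    std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]); // expected: 36 8 72 16
#endif
    return 0;
}
```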

View File

@@ -0,0 +1,35 @@
#include "ggml-backend-impl.h"
#if defined(__riscv) && __riscv_xlen == 64
#include <sys/auxv.h>
//https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
struct riscv64_features {
bool has_rvv = false;
riscv64_features() {
uint32_t hwcap = getauxval(AT_HWCAP);
has_rvv = !!(hwcap & COMPAT_HWCAP_ISA_V);
}
};
static int ggml_backend_cpu_riscv64_score() {
int score = 1;
riscv64_features rf;
#ifdef GGML_USE_RVV
if (!rf.has_rvv) { return 0; }
score += 1 << 1;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_riscv64_score)
#endif // __riscv && __riscv_xlen == 64

View File

@@ -1961,6 +1961,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q4_K_8x8_q8_K;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 8 == 0) {
return &q4_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q2_K) {
if (ggml_cpu_has_avx512()) {
if (cur->ne[1] % 8 == 0) {

View File

@@ -73,34 +73,7 @@ namespace ggml_cuda_mma {
static constexpr int I = I_;
static constexpr int J = J_;
- #if defined(GGML_USE_HIP)
+ #if defined(AMD_MFMA_AVAILABLE)
- #if defined(RDNA4)
- static constexpr int ne = I * J / 32;
- T x[ne] = {0};
- static constexpr __device__ bool supported() {
- if (I == 16 && J == 16) return true;
- return false;
- }
- static __device__ __forceinline__ int get_i(const int l) {
- if constexpr (I == 16 && J == 16) {
- return 8 * (threadIdx.x / 16) + l;
- } else {
- NO_DEVICE_CODE;
- return -1;
- }
- }
- static __device__ __forceinline__ int get_j(const int l) {
- if constexpr (I == 16 && J == 16) {
- return threadIdx.x % 16;
- } else {
- NO_DEVICE_CODE;
- return -1;
- }
- }
- #else
static constexpr int ne = I * J / 64;
T x[ne] = {0};
@@ -146,7 +119,6 @@ namespace ggml_cuda_mma {
return -1;
}
}
- #endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -177,6 +149,34 @@ namespace ggml_cuda_mma {
return -1;
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
return 8 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#endif
#else
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -437,7 +437,20 @@ namespace ggml_cuda_mma {
xi[0] = xs[0];
}
#elif defined(AMD_WMMA_AVAILABLE)
- ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+ if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}else if constexpr (I == 16 && J == 8) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
xi[0] = xs[0];
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
xi[1] = xs1[0];
}else{
NO_DEVICE_CODE;
}
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
@@ -772,6 +785,36 @@ namespace ggml_cuda_mma {
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
#endif // defined(RDNA4)
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -798,6 +841,7 @@ namespace ggml_cuda_mma {
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -842,4 +886,31 @@ namespace ggml_cuda_mma {
mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
#else
GGML_UNUSED(D);
GGML_UNUSED(A);
GGML_UNUSED(B);
NO_DEVICE_CODE;
#endif
}
}

View File

@@ -306,5 +306,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return false;
}
- return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+ if (amd_wmma_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return true;
}
}
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

File diff suppressed because it is too large

View File

@@ -1629,6 +1629,22 @@ class vk_perf_logger {
timings[name].push_back(time);
return;
}
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
const ggml_tensor * dst = node;
const ggml_tensor * q = node->src[0];
const ggml_tensor * k = node->src[1];
const ggml_tensor * v = node->src[2];
const ggml_tensor * m = node->src[3];
std::stringstream name;
name << ggml_op_name(node->op) <<
" dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
" q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
timings[name.str()].push_back(time);
return;
}
timings[ggml_op_name(node->op)].push_back(time);
}
private:

View File

@@ -25,6 +25,20 @@ class Keys:
ALIGNMENT = "general.alignment"
FILE_TYPE = "general.file_type"
# Recommended Sampler Parameters
SAMPLING_SEQUENCE = "general.sampling.sequence"
SAMPLING_TOP_K = "general.sampling.top_k"
SAMPLING_TOP_P = "general.sampling.top_p"
SAMPLING_MIN_P = "general.sampling.min_p"
SAMPLING_XTC_PROBABILITY = "general.sampling.xtc_probability"
SAMPLING_XTC_THRESHOLD = "general.sampling.xtc_threshold"
SAMPLING_TEMP = "general.sampling.temp"
SAMPLING_PENALTY_LAST_N = "general.sampling.penalty_last_n"
SAMPLING_PENALTY_REPEAT = "general.sampling.penalty_repeat"
SAMPLING_MIROSTAT = "general.sampling.mirostat"
SAMPLING_MIROSTAT_TAU = "general.sampling.mirostat_tau"
SAMPLING_MIROSTAT_ETA = "general.sampling.mirostat_eta"
# Authorship Metadata
NAME = "general.name"
AUTHOR = "general.author"

View File

@@ -496,6 +496,42 @@ class GGUFWriter:
def add_file_type(self, ftype: int) -> None:
self.add_uint32(Keys.General.FILE_TYPE, ftype)
def add_sampling_sequence(self, sequence: str) -> None:
self.add_string(Keys.General.SAMPLING_SEQUENCE, sequence)
def add_sampling_top_k(self, top_k: int) -> None:
self.add_int32(Keys.General.SAMPLING_TOP_K, top_k)
def add_sampling_top_p(self, top_p: float) -> None:
self.add_float32(Keys.General.SAMPLING_TOP_P, top_p)
def add_sampling_min_p(self, min_p: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIN_P, min_p)
def add_sampling_xtc_probability(self, xtc_probability: float) -> None:
self.add_float32(Keys.General.SAMPLING_XTC_PROBABILITY, xtc_probability)
def add_sampling_xtc_threshold(self, xtc_threshold: float) -> None:
self.add_float32(Keys.General.SAMPLING_XTC_THRESHOLD, xtc_threshold)
def add_sampling_temp(self, temp: float) -> None:
self.add_float32(Keys.General.SAMPLING_TEMP, temp)
def add_sampling_penalty_last_n(self, penalty_last_n: int) -> None:
self.add_int32(Keys.General.SAMPLING_PENALTY_LAST_N, penalty_last_n)
def add_sampling_penalty_repeat(self, penalty_repeat: float) -> None:
self.add_float32(Keys.General.SAMPLING_PENALTY_REPEAT, penalty_repeat)
def add_sampling_mirostat(self, mirostat: int) -> None:
self.add_int32(Keys.General.SAMPLING_MIROSTAT, mirostat)
def add_sampling_mirostat_tau(self, mirostat_tau: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIROSTAT_TAU, mirostat_tau)
def add_sampling_mirostat_eta(self, mirostat_eta: float) -> None:
self.add_float32(Keys.General.SAMPLING_MIROSTAT_ETA, mirostat_eta)
def add_name(self, name: str) -> None:
self.add_string(Keys.General.NAME, name)

View File

@@ -17,6 +17,20 @@ logger = logging.getLogger("metadata")
@dataclass
class Metadata:
# Recommended Sampler Parameters to be written to GGUF KV Store
sampling_sequence: Optional[str] = None
sampling_top_k: Optional[int] = None
sampling_top_p: Optional[float] = None
sampling_min_p: Optional[float] = None
sampling_xtc_probability: Optional[float] = None
sampling_xtc_threshold: Optional[float] = None
sampling_temp: Optional[float] = None
sampling_penalty_last_n: Optional[int] = None
sampling_penalty_repeat: Optional[float] = None
sampling_mirostat: Optional[int] = None
sampling_mirostat_tau: Optional[float] = None
sampling_mirostat_eta: Optional[float] = None
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None
author: Optional[str] = None
@@ -54,15 +68,43 @@ class Metadata:
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
gen_config = Metadata.load_generation_config(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
if gen_config:
metadata.sampling_sequence = gen_config.get("sequence", metadata.sampling_sequence)
metadata.sampling_top_k = gen_config.get("top_k", metadata.sampling_top_k)
metadata.sampling_top_p = gen_config.get("top_p", metadata.sampling_top_p)
metadata.sampling_min_p = gen_config.get("min_p", metadata.sampling_min_p)
metadata.sampling_xtc_probability = gen_config.get("xtc_probability", metadata.sampling_xtc_probability)
metadata.sampling_xtc_threshold = gen_config.get("xtc_threshold", metadata.sampling_xtc_threshold)
metadata.sampling_temp = gen_config.get("temperature", metadata.sampling_temp)
metadata.sampling_penalty_last_n = gen_config.get("penalty_last_n", metadata.sampling_penalty_last_n)
metadata.sampling_penalty_repeat = gen_config.get("penalty_repeat", metadata.sampling_penalty_repeat)
metadata.sampling_mirostat = gen_config.get("mirostat", metadata.sampling_mirostat)
metadata.sampling_mirostat_tau = gen_config.get("mirostat_tau", metadata.sampling_mirostat_tau)
metadata.sampling_mirostat_eta = gen_config.get("mirostat_eta", metadata.sampling_mirostat_eta)
# Metadata Override File Provided
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.sampling_sequence = metadata_override.get(Keys.General.SAMPLING_SEQUENCE, metadata.sampling_sequence)
metadata.sampling_top_k = metadata_override.get(Keys.General.SAMPLING_TOP_K, metadata.sampling_top_k)
metadata.sampling_top_p = metadata_override.get(Keys.General.SAMPLING_TOP_P, metadata.sampling_top_p)
metadata.sampling_min_p = metadata_override.get(Keys.General.SAMPLING_MIN_P, metadata.sampling_min_p)
metadata.sampling_xtc_probability = metadata_override.get(Keys.General.SAMPLING_XTC_PROBABILITY, metadata.sampling_xtc_probability)
metadata.sampling_xtc_threshold = metadata_override.get(Keys.General.SAMPLING_XTC_THRESHOLD, metadata.sampling_xtc_threshold)
metadata.sampling_temp = metadata_override.get(Keys.General.SAMPLING_TEMP, metadata.sampling_temp)
metadata.sampling_penalty_last_n = metadata_override.get(Keys.General.SAMPLING_PENALTY_LAST_N, metadata.sampling_penalty_last_n)
metadata.sampling_penalty_repeat = metadata_override.get(Keys.General.SAMPLING_PENALTY_REPEAT, metadata.sampling_penalty_repeat)
metadata.sampling_mirostat = metadata_override.get(Keys.General.SAMPLING_MIROSTAT, metadata.sampling_mirostat)
metadata.sampling_mirostat_tau = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_TAU, metadata.sampling_mirostat_tau)
metadata.sampling_mirostat_eta = metadata_override.get(Keys.General.SAMPLING_MIROSTAT_ETA, metadata.sampling_mirostat_eta)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
@@ -172,6 +214,23 @@ class Metadata:
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_generation_config(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
generation_config_path = model_path / "generation_config.json"
if not generation_config_path.is_file():
return {}
try:
with open(generation_config_path, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError):
# not all models have valid generation_config.json
return {}
@staticmethod
def id_to_title(string):
# Convert capitalization into title form unless acronym or version number
@@ -546,6 +605,32 @@ class Metadata:
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
assert self.name is not None
if self.sampling_sequence is not None:
gguf_writer.add_sampling_sequence(self.sampling_sequence)
if self.sampling_top_k is not None:
gguf_writer.add_sampling_top_k(self.sampling_top_k)
if self.sampling_top_p is not None:
gguf_writer.add_sampling_top_p(self.sampling_top_p)
if self.sampling_min_p is not None:
gguf_writer.add_sampling_min_p(self.sampling_min_p)
if self.sampling_xtc_probability is not None:
gguf_writer.add_sampling_xtc_probability(self.sampling_xtc_probability)
if self.sampling_xtc_threshold is not None:
gguf_writer.add_sampling_xtc_threshold(self.sampling_xtc_threshold)
if self.sampling_temp is not None:
gguf_writer.add_sampling_temp(self.sampling_temp)
if self.sampling_penalty_last_n is not None:
gguf_writer.add_sampling_penalty_last_n(self.sampling_penalty_last_n)
if self.sampling_penalty_repeat is not None:
gguf_writer.add_sampling_penalty_repeat(self.sampling_penalty_repeat)
if self.sampling_mirostat is not None:
gguf_writer.add_sampling_mirostat(self.sampling_mirostat)
if self.sampling_mirostat_tau is not None:
gguf_writer.add_sampling_mirostat_tau(self.sampling_mirostat_tau)
if self.sampling_mirostat_eta is not None:
gguf_writer.add_sampling_mirostat_eta(self.sampling_mirostat_eta)
gguf_writer.add_name(self.name)
if self.author is not None:

View File

@@ -253,6 +253,21 @@ extern "C" {
LLAMA_KV_OVERRIDE_TYPE_STR,
};
enum llama_model_meta_key {
LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
};
struct llama_model_kv_override {
enum llama_model_kv_override_type tag;
@@ -534,6 +549,9 @@ extern "C" {
// Get the number of metadata key/value pairs
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
// Get sampling metadata key name. Returns nullptr if the key is invalid
LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
// Get metadata key name by index
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
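Together with `common_init_sampler_from_model` earlier in the diff, the new key enum and `llama_model_meta_key_str` let a client query the recommended values directly from the model's metadata. A minimal sketch of such a lookup, assuming `model` was loaded elsewhere (error handling and model loading omitted):

```cpp
#include <cstdio>
#include <cstdlib>
#include "llama.h"

// Sketch: read the model author's recommended temperature, if the GGUF file carries one.
// Assumes `model` was obtained from the usual llama model-loading API elsewhere.
static float recommended_temp_or(const llama_model * model, float fallback) {
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    char buf[64] = {0};
    if (key && llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
        char * end = nullptr;
        const float v = strtof(buf, &end);
        if (end && end != buf) {
            return v; // value stored under general.sampling.temp
        }
    }
    return fallback; // no recommendation in the model file
}
```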

View File

@@ -1 +1 @@
- 781baf2a14d9e0aaee542b2e1bb918bfc4132199
+ 55bc9320a4aae82af18e23eefd5de319a755d7b9

View File

@@ -119,6 +119,18 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
{ LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
{ LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
{ LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
{ LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
{ LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
{ LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
{ LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
{ LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
{ LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
{ LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
{ LLM_KV_GENERAL_NAME, "general.name" },
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
{ LLM_KV_GENERAL_VERSION, "general.version" },

View File

@@ -123,6 +123,18 @@ enum llm_kv {
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_FILE_TYPE,
LLM_KV_GENERAL_SAMPLING_SEQUENCE,
LLM_KV_GENERAL_SAMPLING_TOP_K,
LLM_KV_GENERAL_SAMPLING_TOP_P,
LLM_KV_GENERAL_SAMPLING_MIN_P,
LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
LLM_KV_GENERAL_SAMPLING_TEMP,
LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
LLM_KV_GENERAL_SAMPLING_MIROSTAT,
LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_VERSION,

View File

@@ -1594,7 +1594,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
- if (!sorted_output) {
+ if (!sorted_output && n_outputs > 1) {
GGML_ASSERT((size_t) n_outputs == out_ids.size());
// TODO: is there something more efficient which also minimizes swaps?


@ -7690,6 +7690,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
return (int)model->gguf_kv.size(); return (int)model->gguf_kv.size();
} }
const char * llama_model_meta_key_str(llama_model_meta_key key) {
switch (key) {
case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
default: return nullptr;
}
}
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) { int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) { if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) { if (buf_size > 0) {


@ -13,9 +13,14 @@ endif()
set(TARGET_SRCS set(TARGET_SRCS
server.cpp server.cpp
utils.hpp
server-http.cpp server-http.cpp
server-http.h server-http.h
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
) )
set(PUBLIC_ASSETS set(PUBLIC_ASSETS
index.html.gz index.html.gz

File diff suppressed because it is too large


@ -0,0 +1,349 @@
#pragma once
#include "common.h"
#include "log.h"
#include "llama.h"
#include "chat.h"
#include "mtmd.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include <string>
#include <vector>
#include <cinttypes>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
using json = nlohmann::ordered_json;
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
using raw_buffer = std::vector<uint8_t>;
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fall back to the default value when the key is missing or null
if (body.contains(key) && !body.at(key).is_null()) {
try {
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
return default_value;
}
} else {
return default_value;
}
}
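// Example (sketch): json_value() falls back to the default when the key is missing,
// null, or of an incompatible type; the request body below is purely illustrative.
json body   = json::parse(R"({"temperature": 0.7, "top_k": null})");
float temp  = json_value(body, "temperature", 0.8f); // -> 0.7f
int   top_k = json_value(body, "top_k", 40);          // null -> default 40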
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
};
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
common_grammar_trigger value;
server_grammar_trigger() = default;
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
server_grammar_trigger(const json & in) {
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
value.value = in.at("value").get<std::string>();
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
value.token = (llama_token) in.at("token").get<int>();
}
}
json to_json() const {
json out {
{"type", (int) value.type},
{"value", value.value},
};
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out["token"] = (int) value.token;
}
return out;
}
};
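// Example (sketch): round-tripping a trigger through JSON, e.g. when it travels inside
// a task; COMMON_GRAMMAR_TRIGGER_TYPE_WORD is assumed to be one of the non-token types.
common_grammar_trigger trig;
trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
trig.value = "<tool_call>";
json j = server_grammar_trigger(trig).to_json();  // {"type": ..., "value": "<tool_call>"}
server_grammar_trigger restored(j);               // "token" is only read for the TOKEN type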
json format_error_response(const std::string & message, const enum error_type type);
//
// random string / id
//
std::string random_string();
std::string gen_chatcmplid();
std::string gen_tool_call_id();
//
// lora utils
//
// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next);
std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
const json & data);
bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2);
// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
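// Example (sketch; the JSON request shape is an assumption, and `lora_base`/`current`
// are assumed to be in scope): enable adapter 0 at half strength and decide whether
// the switch requires clearing the cached prompt state.
json data = json::parse(R"([ {"id": 0, "scale": 0.5} ])");
std::vector<common_adapter_lora_info> next = parse_lora_request(lora_base, data);
if (lora_should_clear_cache(current, next)) {
    // discard the cached KV/prompt state before applying `next`
}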
//
// server_tokens
//
/**
* server_tokens is a helper to manage the input tokens and images for the server.
* it is made this way to simplify the logic of KV cache management.
*/
struct server_tokens {
bool has_mtmd = false;
private: // disallow direct access to these members so they cannot get out of sync
// map a **start** index in tokens to the image chunk
// note: the order needs to stay in sync with tokens
std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
// list of tokens
// if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
// otherwise, it is a normal text token
// note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
// note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping between tokens and pos
llama_tokens tokens;
// for example, with an input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
// idx 0 1 2 3 4 5 6 7 8 9 10
// pos 0 1 2 3 4 5 5 5 7 7 7
// map_idx_to_media will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
~server_tokens() = default;
// Prevent copying
// TODO: server_tokens should be copyable - remove this:
server_tokens(const server_tokens&) = delete;
server_tokens& operator=(const server_tokens&) = delete;
// Allow moving (usually implicitly generated if members are movable)
server_tokens(server_tokens&&) = default;
server_tokens& operator=(server_tokens&&) = default;
// Allow accessing elements using [] operator
llama_token operator[](size_t index) { return tokens[index]; }
const llama_token& operator[](size_t index) const { return tokens[index]; }
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
server_tokens(const llama_tokens & tokens, bool has_mtmd);
// for debugging
std::string str() const;
llama_pos pos_next() const;
const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
void push_back(llama_token tok);
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk);
// appends server tokens, updates the media map. copies media chunks.
void push_back(server_tokens & tokens);
// for compatibility with context shift and prompt truncation
void insert(const llama_tokens & inp_tokens);
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const;
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id);
size_t size() const { return tokens.size(); }
bool empty() const { return tokens.empty(); }
void clear() {
map_idx_to_media.clear();
tokens.clear();
}
void keep_first(size_t n);
std::string detokenize(const llama_context * ctx, bool special) const;
size_t get_common_prefix(const server_tokens & b) const;
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context * ctx) const;
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const;
};
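// Example (sketch): text-only usage; common_tokenize() is the helper from common.h and
// `vocab`, `new_token` and `other_prompt` are assumed to be in scope.
llama_tokens ids = common_tokenize(vocab, "Hello world", /*add_special=*/true, /*parse_special=*/true);
server_tokens prompt(ids, /*has_mtmd=*/false);
prompt.push_back(new_token);                               // append one generated token
prompt.keep_first(32);                                     // keep only the first 32 tokens
size_t n_reuse = prompt.get_common_prefix(other_prompt);   // reusable KV prefix length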
//
// tokenizer and input processing utils
//
bool json_is_array_of_numbers(const json & data);
// does the array contain BOTH numbers & strings?
bool json_is_array_of_mixed_numbers_strings(const json & data);
// does the array contain any individual integers/tokens?
bool json_is_array_and_contains_numbers(const json & data);
// get values by nested path (e.g. key1 / key2)
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);
/**
* this handles 2 cases:
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);
// return the length of the longest prefix of the text that is valid UTF-8
// if the last multibyte character is potentially cut in half, return the index just before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string& text);
// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
*/
std::vector<server_tokens> tokenize_input_prompts(
const llama_vocab * vocab,
mtmd_context * mctx,
const json & json_prompt,
bool add_special,
bool parse_special);
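// Example (sketch): a top-level array of two prompts yields two server_tokens entries;
// passing mctx = nullptr is assumed to be fine here because no multimodal data is involved.
json prompt = json::array({ "first prompt", json::array({12, 34, "mixed", 56}) });
std::vector<server_tokens> inputs = tokenize_input_prompts(vocab, /*mctx=*/nullptr, prompt, /*add_special=*/true, /*parse_special=*/true);
// inputs.size() == 2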
//
// OAI utils
//
// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);
struct oaicompat_parser_options {
bool use_jinja;
bool prefill_assistant;
common_reasoning_format reasoning_format;
std::map<std::string,std::string> chat_template_kwargs;
common_chat_templates * tmpls;
bool allow_image;
bool allow_audio;
bool enable_thinking = true;
};
// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files);
// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
// TODO: move it to server-task.cpp
json format_response_rerank(
const json & request,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
int top_n);
//
// other utils
//
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
std::string safe_json_to_str(const json & data);
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);
// format server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_sse(const json & data);
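// Example (sketch; the exact payload text is illustrative): a JSON array is emitted as
// one server-sent event per element, each prefixed with "data:" and separated by a blank line.
json chunks = json::array({ json{{"index", 0}}, json{{"index", 1}} });
std::string sse = format_sse(chunks);
// roughly: "data: {\"index\":0}\n\ndata: {\"index\":1}\n\n"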
bool is_valid_utf8(const std::string & str);
//
// formatting output responses
// TODO: move these to server-task.cpp
//
llama_tokens format_prompt_infill(
const llama_vocab * vocab,
const json & input_prefix,
const json & input_suffix,
const json & input_extra,
const int n_batch,
const int n_predict,
const int n_ctx,
const bool spm_infill,
const llama_tokens & tokens_prompt);
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
server_tokens format_prompt_rerank(
const struct llama_model * model,
const struct llama_vocab * vocab,
mtmd_context * mctx,
const std::string & query,
const std::string & doc);


@ -1,6 +1,6 @@
#include "utils.hpp"
#include "common.h" #include "common.h"
#include "server-http.h" #include "server-http.h"
#include "server-common.h"
#include <cpp-httplib/httplib.h> #include <cpp-httplib/httplib.h>


@ -0,0 +1,268 @@
#include "server-task.h"
#include "server-queue.h"
#include "log.h"
#include <chrono>
#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_INF(fmt, ...) LOG_INF("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_WRN(fmt, ...) LOG_WRN("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_ERR(fmt, ...) LOG_ERR("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_DBG(fmt, ...) LOG_DBG("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
//
// server_queue
//
int server_queue::post(server_task && task, bool front) {
std::unique_lock<std::mutex> lock(mutex_tasks);
GGML_ASSERT(task.id != -1);
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
const int task_id = task.id;
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
queue_tasks.push_back(std::move(task));
}
condition_tasks.notify_one();
return task_id;
}
int server_queue::post(std::vector<server_task> && tasks, bool front) {
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : tasks) {
if (task.id == -1) {
task.id = id++;
}
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
queue_tasks.push_back(std::move(task));
}
}
condition_tasks.notify_one();
return 0;
}
void server_queue::defer(server_task && task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
condition_tasks.notify_one();
}
int server_queue::get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
return new_id;
}
void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
callback_new_task = std::move(callback);
}
void server_queue::on_update_slots(std::function<void(void)> callback) {
callback_update_slots = std::move(callback);
}
void server_queue::pop_deferred_task() {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!queue_tasks_deferred.empty()) {
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
queue_tasks_deferred.pop_front();
}
condition_tasks.notify_one();
}
void server_queue::terminate() {
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
condition_tasks.notify_all();
}
void server_queue::start_loop() {
running = true;
while (true) {
QUE_DBG("%s", "processing new tasks\n");
while (true) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!running) {
QUE_DBG("%s", "terminate\n");
return;
}
if (queue_tasks.empty()) {
lock.unlock();
break;
}
server_task task = std::move(queue_tasks.front());
queue_tasks.pop_front();
lock.unlock();
QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(std::move(task));
}
// all tasks in the current loop are processed, the slots data is now ready
QUE_DBG("%s", "update slots\n");
callback_update_slots();
QUE_DBG("%s", "waiting for new tasks\n");
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!running) {
QUE_DBG("%s", "terminate\n");
return;
}
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
}
}
}
}
void server_queue::cleanup_pending_task(int id_target) {
// no lock needed because this is called exclusively by post()
auto rm_func = [id_target](const server_task & task) {
return task.id == id_target;
};
queue_tasks.erase(
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
queue_tasks.end());
queue_tasks_deferred.erase(
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
queue_tasks_deferred.end());
}
//
// server_response
//
void server_response::add_waiting_task_id(int id_task) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(id_task);
}
void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & task : tasks) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
waiting_task_ids.insert(task.id);
}
}
void server_response::remove_waiting_task_id(int id_task) {
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// make sure to clean up all pending results
queue_results.erase(
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
return res->id == id_task;
}),
queue_results.end());
}
void server_response::remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : id_tasks) {
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
waiting_task_ids.erase(id_task);
}
}
server_task_result_ptr server_response::recv(const std::unordered_set<int> & id_tasks) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
if (!running) {
RES_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
return !queue_results.empty();
});
for (size_t i = 0; i < queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
for (int i = 0; i < (int) queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (!running) {
RES_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
}
// should never reach here
}
server_task_result_ptr server_response::recv(int id_task) {
std::unordered_set<int> id_tasks = {id_task};
return recv(id_tasks);
}
void server_response::send(server_task_result_ptr && result) {
RES_DBG("sending result for task id = %d\n", result->id);
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : waiting_task_ids) {
if (result->id == id_task) {
RES_DBG("task id = %d pushed to result queue\n", result->id);
queue_results.emplace_back(std::move(result));
condition_results.notify_all();
return;
}
}
}
void server_response::terminate() {
running = false;
condition_results.notify_all();
}

tools/server/server-queue.h (new file, 110 lines)

@ -0,0 +1,110 @@
#pragma once
#include "server-task.h"
#include <condition_variable>
#include <deque>
#include <mutex>
#include <unordered_set>
struct server_queue {
private:
int id = 0;
bool running;
// queues
std::deque<server_task> queue_tasks;
std::deque<server_task> queue_tasks_deferred;
std::mutex mutex_tasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(server_task &&)> callback_new_task;
std::function<void(void)> callback_update_slots;
public:
// Add a new task to the end of the queue
int post(server_task && task, bool front = false);
// multi-task version of post()
int post(std::vector<server_task> && tasks, bool front = false);
// Add a new task, but defer it until a slot is available
void defer(server_task && task);
// Get the next id for creating a new task
int get_new_id();
// Register function to process a new task
void on_new_task(std::function<void(server_task &&)> callback);
// Register the function to be called when all slots data is ready to be processed
void on_update_slots(std::function<void(void)> callback);
// Call when the state of a slot changes; this moves one task from the deferred queue to the main queue
void pop_deferred_task();
// end the start_loop routine
void terminate();
/**
* Main loop consists of these steps:
* - Wait until a new task arrives
* - Process the task (i.e. maybe copy data into slot)
* - Check if multitask is finished
* - Update all slots
*/
void start_loop();
// for metrics
size_t queue_tasks_deferred_size() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return queue_tasks_deferred.size();
}
private:
void cleanup_pending_task(int id_target);
};
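// Example (sketch): minimal wiring of server_queue on a dedicated worker thread
// (<thread> assumed to be included); SERVER_TASK_TYPE_METRICS is used here only as a
// convenient task type with no extra payload.
server_queue queue;
queue.on_new_task([](server_task && task) {
    // hand the task to a slot / handle it here
});
queue.on_update_slots([]() {
    // run one batching/decoding step over the active slots here
});
std::thread worker([&]() { queue.start_loop(); });
server_task task(SERVER_TASK_TYPE_METRICS);
task.id = queue.get_new_id();
queue.post(std::move(task));
// ... on shutdown:
queue.terminate();
worker.join();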
struct server_response {
private:
bool running = true;
// for keeping track of all tasks waiting for the result
std::unordered_set<int> waiting_task_ids;
// the main result queue (using ptr for polymorphism)
std::vector<server_task_result_ptr> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
public:
// add the id_task to the list of tasks waiting for response
void add_waiting_task_id(int id_task);
void add_waiting_tasks(const std::vector<server_task> & tasks);
// when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int id_task);
// remove multiple tasks from waiting list
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks);
// This function blocks the thread until there is a response for one of the id_tasks
server_task_result_ptr recv(const std::unordered_set<int> & id_tasks);
// same as recv(), but with a timeout in seconds
// if timeout is reached, nullptr is returned
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout);
// single-task version of recv()
server_task_result_ptr recv(int id_task);
// Send a new result to a waiting id_task
void send(server_task_result_ptr && result);
// terminate the waiting loop
void terminate();
};
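// Example (sketch): typical flow on an HTTP handler thread; `queue` and `res` are the
// shared server_queue/server_response instances and build_task() is a hypothetical
// helper that fills in a complete server_task with the given id.
int id_task = queue.get_new_id();
res.add_waiting_task_id(id_task);
queue.post(build_task(id_task));
server_task_result_ptr result = res.recv(id_task);   // blocks until the worker calls send()
if (result->is_error()) {
    // translate into an HTTP error response
}
res.remove_waiting_task_id(id_task);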

tools/server/server-task.cpp (new file, 1201 lines)

File diff suppressed because it is too large

tools/server/server-task.h (new file, 453 lines)

@ -0,0 +1,453 @@
#pragma once
#include "common.h"
#include "llama.h"
#include <string>
#include <unordered_set>
#include <list>
// TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h"
using json = nlohmann::ordered_json;
enum server_task_type {
SERVER_TASK_TYPE_COMPLETION,
SERVER_TASK_TYPE_EMBEDDING,
SERVER_TASK_TYPE_RERANK,
SERVER_TASK_TYPE_INFILL,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
};
// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
enum oaicompat_type {
OAICOMPAT_TYPE_NONE,
OAICOMPAT_TYPE_CHAT,
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
};
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
STOP_TYPE_WORD,
STOP_TYPE_LIMIT,
};
struct task_params {
bool stream = true;
bool include_usage = false;
bool cache_prompt = true; // remember the prompt to avoid reprocessing the whole prompt
bool return_tokens = false;
bool return_progress = false;
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_syntax oaicompat_chat_syntax;
// Embeddings
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
json to_json(bool only_metrics = false) const;
};
struct server_task {
int id = -1; // to be filled by server_queue
int index = -1; // used when there are multiple prompts (batch request)
// used by SERVER_TASK_TYPE_CANCEL
int id_target = -1;
int id_slot = -1;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
server_tokens tokens;
server_task_type type;
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
struct slot_action {
int slot_id;
std::string filename;
std::string filepath;
};
slot_action slot_action;
// used by SERVER_TASK_TYPE_METRICS
bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora;
server_task() = default;
server_task(server_task_type type) : type(type) {}
int32_t n_tokens() const {
return tokens.size();
}
static task_params params_from_json_cmpl(
const llama_context * ctx,
const common_params & params_base,
const json & data);
// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
}
return ids;
}
};
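// Example (sketch): building a completion task from an already-parsed request body;
// `ctx`, `params_base`, `body`, `prompt_tokens` and `queue` are assumed to be in scope.
server_task task(SERVER_TASK_TYPE_COMPLETION);
task.id     = queue.get_new_id();
task.index  = 0;
task.params = server_task::params_from_json_cmpl(ctx, params_base, body);
task.tokens = std::move(prompt_tokens);   // a server_tokens built by tokenize_input_prompts()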
struct result_timings {
int32_t cache_n = -1;
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
int32_t draft_n_accepted = 0;
json to_json() const;
};
struct result_prompt_progress {
int32_t total = 0;
int32_t cache = 0;
int32_t processed = 0;
int64_t time_ms = 0;
json to_json() const;
};
struct server_task_result {
int id = -1;
int id_slot = -1;
virtual bool is_error() {
// only used by server_task_result_error
return false;
}
virtual bool is_stop() {
// only used by server_task_result_cmpl_*
return true;
}
virtual int get_index() {
return -1;
}
virtual json to_json() = 0;
virtual ~server_task_result() = default;
};
// using unique_ptr for polymorphism of server_task_result
using server_task_result_ptr = std::unique_ptr<server_task_result>;
struct completion_token_output {
llama_token tok;
float prob;
std::string text_to_send;
struct prob_info {
llama_token tok;
std::string txt;
float prob;
};
std::vector<prob_info> probs;
json to_json(bool post_sampling_probs) const;
static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);
static float logarithm(float x);
static std::vector<unsigned char> str_to_bytes(const std::string & str);
};
struct server_task_result_cmpl_final : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
bool stream;
bool include_usage;
result_timings timings;
std::string prompt;
bool truncated;
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_tokens_cached;
bool has_new_line;
std::string stopping_word;
stop_type stop = STOP_TYPE_NONE;
bool post_sampling_probs;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
task_params generation_params;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_msg oaicompat_msg;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
json to_json_oaicompat_chat();
json to_json_oaicompat_chat_stream();
};
struct server_task_result_cmpl_partial : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
int32_t n_decoded;
int32_t n_prompt_tokens;
bool post_sampling_probs;
bool is_progress = false;
completion_token_output prob_output;
result_timings timings;
result_prompt_progress progress;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
json to_json_oaicompat_chat();
};
struct server_task_result_embd : server_task_result {
int index = 0;
std::vector<std::vector<float>> embedding;
int32_t n_tokens;
// OAI-compat fields
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
};
struct server_task_result_rerank : server_task_result {
int index = 0;
float score = -1e6;
int32_t n_tokens;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
};
struct server_task_result_error : server_task_result {
int index = 0;
error_type err_type = ERROR_TYPE_SERVER;
std::string err_msg;
// for ERROR_TYPE_EXCEED_CONTEXT_SIZE
int32_t n_prompt_tokens = 0;
int32_t n_ctx = 0;
virtual bool is_error() override {
return true;
}
virtual json to_json() override;
};
struct server_task_result_metrics : server_task_result {
int n_idle_slots;
int n_processing_slots;
int n_tasks_deferred;
int64_t t_start;
// TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
uint64_t n_prompt_tokens_processed_total = 0;
uint64_t t_prompt_processing_total = 0;
uint64_t n_tokens_predicted_total = 0;
uint64_t t_tokens_generation_total = 0;
uint64_t n_tokens_max = 0;
uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;
uint64_t n_tokens_predicted = 0;
uint64_t t_tokens_generation = 0;
uint64_t n_decode_total = 0;
uint64_t n_busy_slots_total = 0;
// while we could also use std::vector<server_slot>, that would require copying the slot object, which can be quite messy
// therefore, we use json to temporarily store the slot.to_json() result
json slots_data = json::array();
virtual json to_json() override;
};
struct server_task_result_slot_save_load : server_task_result {
std::string filename;
bool is_save; // true = save, false = load
size_t n_tokens;
size_t n_bytes;
double t_ms;
virtual json to_json() override;
};
struct server_task_result_slot_erase : server_task_result {
size_t n_erased;
virtual json to_json() override;
};
struct server_task_result_apply_lora : server_task_result {
virtual json to_json() override;
};
struct server_prompt_checkpoint {
llama_pos pos_min;
llama_pos pos_max;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
};
struct server_prompt {
server_tokens tokens;
std::vector<uint8_t> data;
std::list<server_prompt_checkpoint> checkpoints;
size_t size() const {
size_t res = data.size();
for (const auto & checkpoint : checkpoints) {
res += checkpoint.size();
}
return res;
}
int n_tokens() const {
return tokens.size();
}
};
struct server_prompt_cache {
server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
this->limit_tokens = limit_tokens;
}
std::list<server_prompt> states;
// in bytes, 0 = no limit
size_t limit_size = 0;
// in tokens, 0 = no limit
size_t limit_tokens = 0;
size_t size() const;
size_t n_tokens() const;
server_prompt * alloc(const server_prompt & prompt, size_t state_size);
bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);
void update();
};
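// Example (sketch): a cache capped at 512 MiB and 65536 tokens (0 disables a limit);
// llama_state_seq_get_size() is the existing llama.h call for sizing a sequence state,
// and `slot_prompt`/`ctx`/`slot_seq_id` are assumed to be in scope.
server_prompt_cache cache(/*limit_size_mib=*/512, /*limit_tokens=*/65536);
server_prompt * cached = cache.alloc(slot_prompt, llama_state_seq_get_size(ctx, slot_seq_id));
cache.update();   // enforce the configured size/token limits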

File diff suppressed because it is too large