llama-fit-params: lower ctx size for multi GPU

Johannes Gäßler 2025-12-16 16:36:57 +01:00
parent ec98e20021
commit 92175d33a0
1 changed file with 29 additions and 13 deletions


@@ -184,6 +184,7 @@ static void llama_params_fit_impl(
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
+    int64_t sum_projected_model = 0;
     int64_t sum_projected_ctx = 0;
     if (nd > 1) {
@@ -199,6 +200,7 @@ static void llama_params_fit_impl(
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
+        sum_projected_model += dmd.mb.model;
         sum_projected_ctx += dmd.mb.context;
     if (nd > 1) {
@@ -234,10 +236,24 @@ static void llama_params_fit_impl(
     if (cparams->n_ctx == 0) {
         if (hp_nct > n_ctx_min) {
             const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
-            const uint32_t ctx_reduction = std::min(
-                uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+            int64_t memory_reduction = -global_surplus;
+            if (nd > 1) {
+                // for multiple devices we need to be more conservative in terms of how much context we think can fit:
+                // - for dense models only whole layers can be assigned to devices
+                // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
+                // - on average we expect a waste of 0.5 layers/tensors per device
+                // - use slightly more than the expected average for nd devices to be safe
+                const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
+                memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+            }
+            uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
             cparams->n_ctx = hp_nct - ctx_reduction;
-            const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
+            cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+            ctx_reduction = hp_nct - cparams->n_ctx;
+            memory_reduction = ctx_reduction * bytes_per_ctx;
             global_surplus += memory_reduction;
             LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                 __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
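
For reference, below is a minimal standalone sketch (not part of the commit) of the arithmetic in the last hunk. All numeric inputs (nd, hp_nct, hp_ngl, hp_nex, n_ctx_min, n_gpu_layers, the projected memory sizes, and the negative global_surplus) are assumed example values. It walks through the multi-GPU safety margin of roughly (nd + 1)/2 layers for dense models (or (nd + 1)/6 for MoE), the ceiling division by bytes_per_ctx, and the rounding of the context size down to a multiple of 256.

// Hypothetical, self-contained sketch of the context-size reduction above.
// All numeric inputs are assumed example values, not measurements.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t  MiB                 = 1024 * 1024;
    const int      nd                  = 2;              // number of GPUs
    const uint32_t hp_nct              = 32768;          // trained context size
    const uint32_t hp_ngl              = 33;             // number of model layers
    const uint32_t hp_nex              = 0;              // 0 -> dense model, > 0 -> MoE
    const uint32_t n_ctx_min           = 4096;           // smallest context we accept
    const uint32_t n_gpu_layers        = 999;            // "offload everything"
    const int64_t  sum_projected_model = 7000ll * MiB;   // projected weight memory across devices
    const int64_t  sum_projected_ctx   = 4096ll * MiB;   // projected context memory at hp_nct
    const int64_t  global_surplus      = -1500ll * MiB;  // negative -> this much memory is missing

    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;

    int64_t memory_reduction = -global_surplus;
    if (nd > 1) {
        // extra safety margin for multiple devices: ~0.5 wasted layers/tensors per device,
        // i.e. (nd + 1)/2 layers for dense models or (nd + 1)/6 layers for MoE models
        const int64_t model_per_layer = sum_projected_model / std::min(n_gpu_layers, hp_ngl);
        memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
    }

    // ceiling division: how many context tokens must go to free memory_reduction bytes,
    // clamped so the context never drops below n_ctx_min
    uint32_t ctx_reduction = std::min(
        uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
    uint32_t n_ctx = hp_nct - ctx_reduction;
    n_ctx = std::max(n_ctx - n_ctx % 256, n_ctx_min); // round down to a multiple of 256

    // recompute the actual reduction after rounding
    ctx_reduction    = hp_nct - n_ctx;
    memory_reduction = ctx_reduction * bytes_per_ctx;

    printf("context reduced from %u to %u -> need %lld MiB less memory\n",
           hp_nct, n_ctx, (long long) (memory_reduction / MiB));
    return 0;
}

With these example numbers the sketch reduces the context from 32768 to 18176 tokens, freeing 1824 MiB: the 1500 MiB shortfall plus the assumed multi-GPU margin, rounded up to whole tokens and then to a multiple of 256.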