llama-params-fit: fix output layer assignment

Johannes Gäßler 2025-12-30 17:11:20 +01:00
parent cd78e57c3a
commit 15182a1bb0
1 changed file with 2 additions and 4 deletions

@@ -512,9 +512,6 @@ static void llama_params_fit_impl(
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
             uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-            if (hp_nex > 0 && size_t(id) == nd - 1) {
-                delta--;
-            }
             LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
@@ -524,7 +521,8 @@ static void llama_params_fit_impl(
                 std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                 ngl_per_device_test[id].n_layer += step_size;
                 if (hp_nex) {
-                    ngl_per_device_test[id].n_part += step_size;
+                    ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                        step_size - 1 : step_size; // the first layer is the output layer which must always be full
                 }
                 const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
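For context, here is a minimal standalone sketch (not from the repository) of the corrected accounting. The struct fields and variable names mirror the diff; the helper function, its signature, and the exact meaning of n_part are assumptions for illustration. The point of the fix: on the last device, the first layer assigned is the output layer, which must always stay full, so it is excluded from the partial-layer count exactly once inside the fill loop rather than by pre-decrementing delta up front.

    // Minimal sketch of the corrected n_part update; names mirror the diff,
    // everything else (struct layout, helper, semantics) is assumed.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ngl_t {
        uint32_t n_layer = 0; // layers assigned to this device
        uint32_t n_part  = 0; // layers offloaded only partially (assumed meaning)
    };

    // Hypothetical helper: apply a trial step of step_size layers to device id
    // out of nd devices, mirroring the update from the diff.
    static void apply_step(std::vector<ngl_t> & ngl, size_t id, size_t nd,
                           uint32_t hp_nex, uint32_t step_size) {
        ngl[id].n_layer += step_size;
        if (hp_nex) {
            // On the last device the first layer is the output layer, which
            // must always be full, so it is excluded from the partial count
            // exactly once (while n_part is still zero).
            ngl[id].n_part += id == nd - 1 && ngl[id].n_part == 0 ?
                step_size - 1 : step_size;
        }
    }

    int main() {
        std::vector<ngl_t> ngl(2);
        apply_step(ngl, 1, 2, /*hp_nex =*/ 8, /*step_size =*/ 4);
        assert(ngl[1].n_layer == 4 && ngl[1].n_part == 3); // output layer kept full
        apply_step(ngl, 1, 2, /*hp_nex =*/ 8, /*step_size =*/ 4);
        assert(ngl[1].n_layer == 8 && ngl[1].n_part == 7); // adjusted only once
        printf("ok\n");
        return 0;
    }

Because the adjustment is keyed on n_part still being zero, later steps in the same fill loop add step_size unchanged, which is what the removed unconditional delta-- could not guarantee.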