From e9fd8dcab45d6cd147874e32565923bdfd0efbdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 24 Jan 2026 22:13:08 +0100
Subject: [PATCH] llama-fit-params: keep explicit --ctx-size 0 (#19070)

---
 common/arg.cpp                  | 4 ++++
 include/llama.h                 | 1 +
 src/llama.cpp                   | 8 ++++++--
 tools/fit-params/fit-params.cpp | 2 +-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 163c9b71b0..98477e8117 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1231,6 +1231,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
         [](common_params & params, int value) {
             params.n_ctx = value;
+            if (value == 0) {
+                // disable context reduction in llama_params_fit if the user explicitly requests the full context size:
+                params.fit_params_min_ctx = UINT32_MAX;
+            }
         }
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
diff --git a/include/llama.h b/include/llama.h
index 280745713e..1507107f1a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -489,6 +489,7 @@ extern "C" {
     //  - returns true if the parameters could be successfully modified to fit device memory
     //  - this function is NOT thread safe because it modifies the global llama logger state
     //  - only parameters that have the same value as in llama_default_model_params are modified
+    //      with the exception of the context size which is modified if and only if equal to 0
     LLAMA_API enum llama_params_fit_status llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
diff --git a/src/llama.cpp b/src/llama.cpp
index f1096d960e..11b75fcff9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -311,8 +311,12 @@ static void llama_params_fit_impl(
                     __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
             }
         } else {
-            LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
-                __func__, hp_nct, n_ctx_min);
+            if (n_ctx_min == UINT32_MAX) {
+                LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+            } else {
+                LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                    __func__, hp_nct, n_ctx_min);
+            }
         }
     } else {
         LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp
index f9d9cb34c7..0176be06e7 100644
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -36,7 +36,7 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
     common_log_flush(common_log_main());
-    printf("-c %" PRIu32 " -ngl %" PRIu32, cparams.n_ctx, mparams.n_gpu_layers);
+    printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);
 
     size_t nd = llama_max_devices();
     while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
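
A minimal usage sketch of the behavior this patch changes, assuming a standard llama.cpp build in which the tool binary is named llama-fit-params and the model path is passed via -m; the model path and printed values below are hypothetical, not taken from the patch. With this change, an explicit --ctx-size 0 sets fit_params_min_ctx to UINT32_MAX, so llama_params_fit keeps the model's full context size instead of shrinking it to fit device memory:

    # explicit --ctx-size 0: the full model context is kept, only the
    # remaining parameters (e.g. -ngl) are fitted
    ./llama-fit-params -m ./models/model.gguf --ctx-size 0
    # example output (values illustrative): -c 131072 -ngl 99

    # no --ctx-size argument: llama_params_fit may still reduce the context
    # size toward the minimum when device memory is tight
    ./llama-fit-params -m ./models/model.gguf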