From bdd7ec7f56a4dc2f2108496c316a94949fe99472 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 7 Jan 2026 18:11:51 +0000 Subject: [PATCH] Implement target_size logic --- src/llama-quant.cpp | 48 +++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67e5aa9827..dc695fba0e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,6 +635,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); @@ -1565,9 +1566,26 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( if (q_elements == 0) { return {}; } - const double target_bpw = params->target_bpw; - size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); - size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes; + size_t budget_bytes = 0; + + if (params->target_size != -1) { + const auto metadata_size = gguf_get_meta_size(ml.meta.get()); + // Budget for quantizable weights = target - metadata - Non-Quantizable Weights + int64_t available = (int64_t)params->target_size - (int64_t)metadata_size - (int64_t)nq_bytes; + + // Clamp to the absolute minimum possible size for the variable tensors + if (available < (int64_t)min_bytes) { + LLAMA_LOG_WARN("%s: requested file size %zu is smaller than minimum possible model size (~%zu), clamping to minimum.\n", + func, (size_t)params->target_size, min_bytes + nq_bytes + metadata_size); + budget_bytes = min_bytes; + } else { + budget_bytes = (size_t)available; + } + } else { + const double target_bpw = params->target_bpw; + size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); + budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; + } // Get the types' override auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> { @@ -2004,7 +2022,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map<std::string, ggml_type> bpw_overrides = {}; - if (params->target_bpw != -1.0f && !params->only_copy) { + if ((params->target_bpw != -1.0f || params->target_size != -1) && !params->only_copy) { if (params->imatrix) { if (params->activations) { LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); @@ -2012,15 +2030,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } if (params->no_importance) { - LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); + LLAMA_LOG_INFO("%s: distributing budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more budget to important tensors\n", __func__); + } + if (params->target_size >= 0) { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve file size %.2f MiB\n", + __func__, (double)params->target_size / 1024.0 / 1024.0); + } else { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); } - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw/--target-size require an imatrix but none was provided, ignoring\n", __func__); } } @@ -2091,12 +2114,12 @@ static void llama_model_quantize_impl(const std::string
& fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f || params->target_size != -1)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get quantization type overrides targeting a given bits per weight budget - if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + // get quantization type overrides targeting a bpw or file size budget + if ((params->target_bpw != -1.0f || params->target_size != -1) && !bpw_overrides.empty()) { const auto override = bpw_overrides.find(name); if (override != bpw_overrides.end() && override->second != new_type) { LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); @@ -2104,7 +2127,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } - // unless the user specifies a type, and the tensor shape will not require fallback quantisation + // unless the user specifies a type, and the tensor shape will not require fallback quantization if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::string tensor_name(tensor->name); @@ -2281,6 +2304,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, + /*.target_size =*/ -1, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, /*.no_importance =*/ false