From bdd7ec7f56a4dc2f2108496c316a94949fe99472 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 7 Jan 2026 18:11:51 +0000 Subject: [PATCH] Implement target_size logic --- src/llama-quant.cpp | 48 +++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67e5aa9827..dc695fba0e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,6 +635,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); @@ -1565,9 +1566,26 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( if (q_elements == 0) { return {}; } - const double target_bpw = params->target_bpw; - size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); - size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes; + size_t budget_bytes = 0; + + if (params->target_size != -1) { + const auto metadata_size = gguf_get_meta_size(ml.meta.get()); + // Budget for quantizable weights = target - metadata - Non-Quantizable Weights + int64_t available = (int64_t)params->target_size - (int64_t)metadata_size - (int64_t)nq_bytes; + + // Clamp to the absolute minimum possible size for the variable tensors + if (available < (int64_t)min_bytes) { + LLAMA_LOG_WARN("%s: requested file size %zu is smaller than minimum possible model size (~%zu), clamping to minimum.\n", + func, (size_t)params->target_size, min_bytes + nq_bytes + metadata_size); + budget_bytes = min_bytes; + } else { + budget_bytes = (size_t)available; + } + } else { + const double target_bpw = params->target_bpw; + size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); + budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; + } // Get the types' override auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> { @@ -2004,7 +2022,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map<std::string, ggml_type> bpw_overrides = {}; - if (params->target_bpw != -1.0f && !params->only_copy) { + if ((params->target_bpw != -1.0f || params->target_size != -1) && !params->only_copy) { if (params->imatrix) { if (params->activations) { LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); @@ -2012,15 +2030,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } if (params->no_importance) { - LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); + LLAMA_LOG_INFO("%s: distributing budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more budget to important tensors\n", __func__); + } + if (params->target_size >= 0) { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve file size %.2f MiB\n", + __func__, (double)params->target_size / 1024.0 / 1024.0); + } else { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); } - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw/--target-size require an imatrix but none was provided, ignoring\n", __func__); } } @@ -2091,12 +2114,12 @@ static void llama_model_quantize_impl(const std::string
& fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f || params->target_size != -1)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get quantization type overrides targeting a given bits per weight budget - if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + // get quantization type overrides targeting a bpw or file size budget + if ((params->target_bpw != -1.0f || params->target_size != -1) && !bpw_overrides.empty()) { const auto override = bpw_overrides.find(name); if (override != bpw_overrides.end() && override->second != new_type) { LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); @@ -2104,7 +2127,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } - // unless the user specifies a type, and the tensor shape will not require fallback quantisation + // unless the user specifies a type, and the tensor shape will not require fallback quantization if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::string tensor_name(tensor->name); @@ -2281,6 +2304,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, + /*.target_size =*/ -1, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, /*.no_importance =*/ false