From 097bdb34deec16534c3b925924c2e72e1b296d33 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 7 Jan 2026 18:10:27 +0000 Subject: [PATCH] Add --target-size option --- include/llama.h | 1 + tools/quantize/quantize.cpp | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/llama.h b/include/llama.h index 138fa60708..cd76871be2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -394,6 +394,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + int64_t target_size; // target file size in bytes bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file bool no_importance; // allocate target bpw budget equitably across all tensors diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3d461348fb..128750a213 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -574,6 +574,7 @@ int main(int argc, char ** argv) { std::vector<tensor_quantization> tensor_types; std::vector<int> prune_layers; float target_bpw = -1.0f; + int64_t target_size = -1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -604,6 +605,10 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--target-size") == 0) { + if (arg_idx == argc-1 || !parse_target_size(argv[++arg_idx], target_size)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { @@ -716,6 +721,9 @@ int main(int argc, char ** argv) { if (target_bpw != -1.0f) { params.target_bpw = target_bpw; } + if (target_size != -1) { + params.target_size = target_size; + } 
llama_backend_init(); @@ -750,9 +758,9 @@ int main(int argc, char ** argv) { } arg_idx++; - // select quantization type if target_bpw is set unless user specifies type and threads - if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { - auto * ftype = const_cast<char *>(get_ftype(params.target_bpw)); + // If --target-bpw or --target-size is set, select a quantization type unless user specifies type and threads + if (argc - arg_idx <= 1 && (params.target_bpw != -1.0f || params.target_size != -1)) { + auto * ftype = params.target_bpw != -1.0f ? const_cast<char *>(get_ftype(params.target_bpw)) : const_cast<char *>("F16"); if (argc == arg_idx) { tmp_argv.push_back(ftype); } else { tmp_argv.insert(tmp_argv.end() - 1, ftype); } tmp_argv.push_back(nullptr);