From 097bdb34deec16534c3b925924c2e72e1b296d33 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 7 Jan 2026 18:10:27 +0000 Subject: [PATCH] Add --target-size option --- include/llama.h | 1 + tools/quantize/quantize.cpp | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/llama.h b/include/llama.h index 138fa60708..cd76871be2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -394,6 +394,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + int64_t target_size; // target file size in bytes bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file bool no_importance; // allocate target bpw budget equitably across all tensors diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3d461348fb..128750a213 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -574,6 +574,7 @@ int main(int argc, char ** argv) { std::vector<tensor_quantization> tensor_types; std::vector<int> prune_layers; float target_bpw = -1.0f; + int64_t target_size = -1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -604,6 +605,10 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--target-size") == 0) { + if (arg_idx == argc-1 || !parse_target_size(argv[++arg_idx], target_size)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { @@ -716,6 +721,9 @@ int main(int argc, char ** argv) { if (target_bpw != -1.0f) { params.target_bpw = target_bpw; } + if (target_size != -1) { + params.target_size = target_size; + } 
llama_backend_init(); @@ -750,9 +758,9 @@ int main(int argc, char ** argv) { } arg_idx++; - // select quantization type if target_bpw is set unless user specifies type and threads - if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { - auto * ftype = const_cast<char *>(get_ftype(params.target_bpw)); + // If --target-bpw or --target-size is set, select a quantization type unless user specifies type and threads + if (argc - arg_idx <= 1 && (params.target_bpw != -1.0f || params.target_size != -1)) { + auto * ftype = params.target_bpw != -1.0f ? const_cast<char *>(get_ftype(params.target_bpw)) : const_cast<char *>("F16"); if (argc == arg_idx) { tmp_argv.push_back(ftype); } else { tmp_argv.insert(tmp_argv.end() - 1, ftype); } tmp_argv.push_back(nullptr);