From e34fe515c699408e733be02197e95bbdaa364c38 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Tue, 3 Feb 2026 22:36:19 -0600
Subject: [PATCH] cleanup `llama-quantize --help` output

some much needed TLC
---
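Reviewer note: a quick sketch of the invocations the revised help text
describes. Illustrative only -- the file names (model-f32.gguf, imatrix.dat,
tensor_type_list.txt) and the metadata key in the last command are
placeholders, while the flags and the attn_q=q8_0 example come from the help
text itself.

    # quantize to Q4_K_M using an imatrix, leaving output.weight unquantized
    ./llama-quantize --imatrix imatrix.dat --leave-output-tensor \
        model-f32.gguf model-Q4_K_M.gguf Q4_K_M 8

    # override the quantization type of a single tensor
    ./llama-quantize --tensor-type attn_q=q8_0 model-f32.gguf model-quant.gguf Q4_K_M

    # the same via a file: tensor_name=ggml_type pairs separated by spaces/newlines
    printf 'attn_q=q8_0\nattn_k=q8_0\n' > tensor_type_list.txt
    ./llama-quantize --tensor-type-file tensor_type_list.txt \
        model-f32.gguf model-quant.gguf Q4_K_M

    # override a metadata key in the quantized model (KEY=TYPE:VALUE)
    ./llama-quantize --override-kv tokenizer.ggml.add_bos_token=bool:false \
        model-f32.gguf model-quant.gguf Q4_K_M
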
 tools/quantize/quantize.cpp | 59 +++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 19 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 0709e0bda0..e5f4bd8b48 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftyp
 
 [[noreturn]] static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
+    printf("       [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
-    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
-    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
-    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
-    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
-    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
-    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
-    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
-    printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
-    printf("      Advanced option to remove all tensors from the given layers\n");
-    printf("  --keep-split: will generate quantized model in the same shards as input\n");
+    printf("  --allow-requantize\n");
+    printf("      allows requantizing tensors that have already been quantized.\n");
+    printf("      WARNING: this can severely reduce quality compared to quantizing from 16bit\n");
+    printf("      or 32bit!\n");
+    printf("  --leave-output-tensor\n");
+    printf("      will leave output.weight un(re)quantized. increases model size but may\n");
+    printf("      also increase quality, especially when requantizing\n");
+    printf("  --pure\n");
+    printf("      disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrix file_name\n");
+    printf("      use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name\n");
+    printf("      use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name\n");
+    printf("      do not use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type\n");
+    printf("      use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type\n");
+    printf("      use this ggml_type for the token embeddings tensor\n");
+    printf("  --tensor-type tensor_name=ggml_type\n");
+    printf("      quantize this tensor to this ggml_type\n");
+    printf("      this is an advanced option to selectively quantize tensors. may be specified multiple times.\n");
+    printf("      example: --tensor-type attn_q=q8_0\n");
+    printf("  --tensor-type-file tensor_type.txt\n");
+    printf("      list of tensors to quantize to a specific ggml_type\n");
+    printf("      this is an advanced option to selectively quantize a long list of tensors.\n");
+    printf("      the file should use the same format as above, separated by spaces or newlines.\n");
+    printf("  --prune-layers L0,L1,L2...\n");
+    printf("      comma-separated list of layer numbers to prune from the model\n");
+    printf("      WARNING: this is an advanced option, use with care.\n");
+    printf("  --keep-split\n");
+    printf("      will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
-    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
-    printf("\nAllowed quantization types:\n");
+    printf("      override model metadata by key in the quantized model. may be specified multiple times.\n");
+    printf("      WARNING: this is an advanced option, use with care.\n\n");
+    printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
+    printf("-----------------------------------------------------------------------------\n");
+    printf("  allowed quantization types\n");
+    printf("-----------------------------------------------------------------------------\n\n");
     for (const auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf("  %2d  or  ", it.ftype);