From 3dd95914d09b155eed84664b9abdbbffae238738 Mon Sep 17 00:00:00 2001
From: EugeoSynthesisThirtyTwo
Date: Sat, 31 Jan 2026 04:39:21 +0100
Subject: [PATCH] quantize: add option --tensor-type-file to llama-quantize (#18572)

* add option --tensor-type-file to llama-quantize, but it raises an error.

* add error message when file not found

* quantize: update help menu, fix CI

Signed-off-by: Aaron Teo

---------

Signed-off-by: Aaron Teo
Co-authored-by: Your Name
Co-authored-by: Aaron Teo
---
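Usage sketch (note only, not part of the patch; the tensor names attn_k and
ffn_down, the type q6_k, the ftype Q4_K_M and the thread count are illustrative
assumptions). The file passed to --tensor-type-file is plain text containing
tensor_name=ggml_type entries separated by spaces or newlines, e.g. a
tensor_type_list.txt with:

    attn_q=q8_0
    attn_k=q8_0
    ffn_down=q6_k

which could then be applied with:

    ./llama-quantize --tensor-type-file tensor_type_list.txt model-f32.gguf model-quant.gguf Q4_K_M 8
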
 tools/quantize/quantize.cpp | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 881f4b3dd9..0709e0bda0 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -131,6 +131,8 @@ static void usage(const char * executable) {
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
     printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
+    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
+    printf("      Advanced option to selectively quantize a long list of tensors. Format is tensor_name=ggml_type, entries separated by spaces or newlines.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
@@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
     return true;
 }
 
+static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) {
+    std::ifstream file(filename);
+    if (!file) {
+        printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno));
+        return false;
+    }
+
+    std::string arg;
+    while (file >> arg) {
+        if (!parse_tensor_type(arg.c_str(), tensor_type)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
     if (!data) {
         printf("\n%s: no layer pruning ids provided\n\n", __func__);
@@ -480,6 +499,10 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);
@@ -686,3 +709,4 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+