quantize: add option --tensor-type-file to llama-quantize (#18572)

* add option --tensor-type-file to llama-quantize, but it raises an error
* add error message when file not found
* quantize: update help menu, fix CI

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Aaron Teo <aaron.teo1@ibm.com>
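
As a quick illustration of the new flag (hypothetical file and model names, following the usage string in the diff below, and assuming the usual semantics that per-tensor overrides take precedence over the base quantization type):

    llama-quantize --tensor-type-file tensor_type_list.txt model-f32.gguf model-quant.gguf Q4_K_M 8

Here tensor_type_list.txt holds tensor_name=ggml_type entries such as attn_q=q8_0, while tensors the file does not mention are quantized to the base type (Q4_K_M in this sketch).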

parent ec6c7421e4
commit 3dd95914d0

@@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");

@@ -131,6 +131,8 @@ static void usage(const char * executable) {
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
     printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
+    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
+    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
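
To make the format described in the new help text concrete, a hypothetical tensor_type_list.txt might contain (tensor names follow the attn_q=q8_0 example above; the exact names available depend on the model architecture):

    attn_q=q8_0 attn_k=q8_0
    attn_v=q6_k
    ffn_down=q5_k

Both spaces and newlines separate entries, matching the help text's "separated by spaces/newline".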

@@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
     return true;
 }
 
+static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) {
+    std::ifstream file(filename);
+    if (!file) {
+        printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno));
+        return false;
+    }
+
+    std::string arg;
+    while (file >> arg) {
+        if (!parse_tensor_type(arg.c_str(), tensor_type)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
     if (!data) {
         printf("\n%s: no layer pruning ids provided\n\n", __func__);
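
A note on the parsing loop above: operator>> on std::string skips leading whitespace and stops at the next whitespace character, which is exactly why spaces, tabs, and newlines are interchangeable separators in the file. A minimal standalone sketch (not part of the commit) demonstrating that behavior:

    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
        // Same extraction pattern as parse_tensor_type_file, but reading
        // from an in-memory stream instead of a file on disk.
        std::istringstream file("attn_q=q8_0 attn_k=q8_0\nffn_down=q5_k");
        std::string arg;
        while (file >> arg) {
            std::cout << arg << '\n'; // one tensor_name=ggml_type entry per iteration
        }
        return 0;
    }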

@@ -480,6 +499,10 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);
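
Given this wiring, a nonexistent file fails fast: parse_tensor_type_file prints the strerror message and returns false, and usage(), being [[noreturn]], then terminates the program. For example (hypothetical path, output reconstructed from the printf format string above):

    $ llama-quantize --tensor-type-file missing.txt model-f32.gguf Q4_K_M

    parse_tensor_type_file: failed to open file 'missing.txt': No such file or directory

followed by the help text.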

@@ -686,3 +709,4 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+