diff --git a/include/llama.h b/include/llama.h index 077f66dc65..9a77638c81 100644 --- a/include/llama.h +++ b/include/llama.h @@ -378,22 +378,33 @@ extern "C" { size_t n_samplers; }; + typedef struct llama_model_tensor_override { + const char * pattern; + enum ggml_type type; + } llama_model_tensor_override; + + typedef struct llama_imatrix_data { + const char * name; + const float * data; + size_t size; + } llama_imatrix_data; + // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - bool dry_run; // calculate and show the final quantization size without performing quantization - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides - void * tensor_types; // pointer to vector containing tensor types - void * prune_layers; // pointer to vector containing layer indices to prune + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, 
allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + bool dry_run; // calculate and show the final quantization size without performing quantization + const struct llama_imatrix_data * imatrix; // pointer to importance matrix data (array terminated by an entry with name == NULL) + const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides (array terminated by an entry with key[0] == 0) + const struct llama_model_tensor_override * tensor_types; // pointer to tensor overrides (array terminated by an entry with pattern == NULL) + const int32_t * prune_layers; // pointer to layer indices to prune (array terminated by -1) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1..de6928216a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -549,12 +549,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: constexpr bool use_mmap = false; #endif - llama_model_kv_override * kv_overrides = nullptr; - if (params->kv_overrides) { - auto * v = (std::vector*)params->kv_overrides; - kv_overrides = v->data(); - } - + const llama_model_kv_override * kv_overrides = params->kv_overrides; std::vector splits = {}; llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching @@ -570,9 +565,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->only_copy) { ftype = ml.ftype; } + std::unordered_map> i_data; const std::unordered_map> * imatrix_data = nullptr; if (params->imatrix) { - imatrix_data = static_cast>*>(params->imatrix); + for (const llama_imatrix_data * p = params->imatrix; p->name != nullptr; p++) { + i_data.emplace(p->name, std::vector(p->data, p->data + p->size)); + } + imatrix_data = & i_data; if (imatrix_data) { LLAMA_LOG_INFO("================================ Have weights data with %d 
entries\n",int(imatrix_data->size())); qs.has_imatrix = true; @@ -592,7 +591,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector prune_list = {}; if (params->prune_layers) { - prune_list = *static_cast *>(params->prune_layers); + for (const int32_t * p = params->prune_layers; * p != -1; p++) { + prune_list.push_back(* p); + } } // copy the KV pairs from the input file @@ -606,20 +607,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); if (params->kv_overrides) { - const std::vector & overrides = *(const std::vector *)params->kv_overrides; - for (const auto & o : overrides) { - if (o.key[0] == 0) break; - if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { - gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64); - } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { + for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) { + if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { + gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64); + } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) { // Setting type to UINT32. 
See https://github.com/ggml-org/llama.cpp/pull/14182 for context - gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64)); - } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { - gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool); - } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - gguf_set_val_str(ctx_out.get(), o.key, o.val_str); + gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64)); + } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { + gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool); + } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + gguf_set_val_str(ctx_out.get(), o->key, o->val_str); } else { - LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); + LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key); } } } @@ -850,13 +849,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if the user provided tensor types - use those bool manual = false; if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins + for (const auto * p = params->tensor_types; p->pattern != nullptr; p++) { + if (std::regex pattern(p->pattern); std::regex_search(tensor_name, pattern)) { + if (p->type != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(p->type)); + new_type = p->type; // if two or more types are specified for the same tensor, the last match wins manual = true; break; } diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 
59bf9bd3fd..431e002da8 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -577,8 +577,16 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> imatrix_data; int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data); + + std::vector i_data; + std::vector t_override; if (!imatrix_data.empty()) { - params.imatrix = &imatrix_data; + i_data.reserve(imatrix_data.size() + 1); + for (const auto & kv : imatrix_data) { + i_data.push_back({kv.first.c_str(), kv.second.data(), kv.second.size()}); + } + i_data.push_back({nullptr, nullptr, 0}); // array terminator + params.imatrix = i_data.data(); { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); @@ -596,7 +604,6 @@ int main(int argc, char ** argv) { kvo.val_str[127] = '\0'; kv_overrides.emplace_back(std::move(kvo)); } - { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); @@ -604,7 +611,6 @@ int main(int argc, char ** argv) { kvo.val_i64 = imatrix_data.size(); kv_overrides.emplace_back(std::move(kvo)); } - if (m_last_call > 0) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); @@ -616,13 +622,19 @@ int main(int argc, char ** argv) { if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; - params.kv_overrides = &kv_overrides; + params.kv_overrides = kv_overrides.data(); } if (!tensor_types.empty()) { - params.tensor_types = &tensor_types; + t_override.reserve(tensor_types.size() + 1); + for (const auto & tt : tensor_types) { + t_override.push_back({tt.name.c_str(), tt.quant}); + } + t_override.push_back({nullptr, GGML_TYPE_COUNT}); // array terminator + params.tensor_types = t_override.data(); } if (!prune_layers.empty()) { - params.prune_layers = &prune_layers; + prune_layers.push_back(-1); // array terminator + params.prune_layers = prune_layers.data(); } 
llama_backend_init();