This commit is contained in:
Ed Addario 2026-03-15 23:55:07 +02:00 committed by GitHub
commit 6350cb55d8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 65 additions and 49 deletions

View File

@ -380,22 +380,33 @@ extern "C" {
size_t n_samplers;
};
// Tensor type override entry: pairs a pattern with a ggml type.
// Passed to the quantizer as an array through llama_model_quantize_params::tensor_types;
// the array is terminated by an entry whose pattern is NULL.
struct llama_model_tensor_override {
const char * pattern; // pattern string; the quantizer compiles it with std::regex (presumably matched against tensor names - confirm)
enum ggml_type type; // ggml type stored alongside the compiled pattern
};
// Importance matrix entry: a named, contiguous run of float values.
// Passed to the quantizer as an array through llama_model_quantize_params::imatrix;
// the array is terminated by an entry whose name is NULL.
// The quantizer copies [data, data + size) into its own storage keyed by name.
struct llama_imatrix_data {
const char * name; // key of the entry (presumably the tensor name - confirm against caller); NULL marks the end of the array
const float * data; // pointer to the first value
size_t size; // number of float elements in data (not bytes)
};
// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
bool dry_run; // calculate and show the final quantization size without performing quantization
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
void * prune_layers; // pointer to vector containing layer indices to prune
// NOTE(review): the fields below repeat the list above, with the four trailing void * members
// replaced by typed array pointers - this looks like both sides of a diff; confirm only one set is kept.
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
bool dry_run; // calculate and show the final quantization size without performing quantization
// The four pointers below are optional (NULL = not provided); when set, each points to a sentinel-terminated array.
const struct llama_imatrix_data * imatrix; // pointer to importance matrix data; array ends at the entry with name == NULL
const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides; array ends at the entry with key[0] == 0
const struct llama_model_tensor_override * tensor_types; // pointer to tensor overrides; array ends at the entry with pattern == NULL
const int32_t * prune_layers; // pointer to layer indices to prune; array ends at the value -1
} llama_model_quantize_params;
typedef struct llama_logit_bias {

View File

@ -84,7 +84,6 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
for (const auto & p : mapped) {
if (p.second == blk) {
LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
}
}
@ -189,9 +188,8 @@ struct quantize_state_impl {
{
// compile regex patterns once - they are expensive
if (params->tensor_types) {
const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
for (const auto & [tname, qtype] : tensor_types) {
tensor_type_patterns.emplace_back(std::regex(tname), qtype);
for (const auto * p = params->tensor_types; p->pattern != nullptr; p++) {
tensor_type_patterns.emplace_back(std::regex(p->pattern), p->type);
}
}
}
@ -851,12 +849,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
constexpr bool use_mmap = false;
#endif
llama_model_kv_override * kv_overrides = nullptr;
if (params->kv_overrides) {
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
const llama_model_kv_override * kv_overrides = params->kv_overrides;
std::vector<std::string> splits = {};
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
@ -873,9 +866,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (params->only_copy) {
ftype = ml.ftype;
}
std::unordered_map<std::string, std::vector<float>> i_data;
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
for (const llama_imatrix_data * p = params->imatrix; p->name != nullptr; p++) {
i_data.emplace(p->name, std::vector<float>(p->data, p->data + p->size));
}
imatrix_data = & i_data;
if (imatrix_data) {
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
__func__, (int)imatrix_data->size());
@ -896,7 +893,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
std::vector<int> prune_list = {};
if (params->prune_layers) {
prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
for (const int32_t * p = params->prune_layers; * p != -1; p++) {
prune_list.push_back(* p);
}
}
// copy the KV pairs from the input file
@ -910,20 +909,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
if (params->kv_overrides) {
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
for (const auto & o : overrides) {
if (o.key[0] == 0) break;
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) {
if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64);
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64));
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool);
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
gguf_set_val_str(ctx_out.get(), o->key, o->val_str);
} else {
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key);
}
}
}

View File

@ -13,13 +13,10 @@
#include <unordered_map>
#include <map>
#include <fstream>
#include <cmath>
#include <cctype>
#include <algorithm>
#include <filesystem>
// result of parsing --tensor-type option
// (changes to this struct must be reflected in src/llama-quant.cpp)
// changes to this struct must also be reflected in src/llama-quant.cpp
struct tensor_type_option {
std::string name;
ggml_type type = GGML_TYPE_COUNT;
@ -491,7 +488,6 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
usage(argv[0]);
}
@ -584,8 +580,16 @@ int main(int argc, char ** argv) {
std::vector<std::string> imatrix_datasets;
std::unordered_map<std::string, std::vector<float>> imatrix_data;
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
std::vector<llama_imatrix_data> i_data;
std::vector<llama_model_tensor_override> t_override;
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
i_data.reserve(imatrix_data.size() + 1);
for (const auto & kv : imatrix_data) {
i_data.push_back({kv.first.c_str(), kv.second.data(), kv.second.size()});
}
i_data.push_back({nullptr, nullptr, 0}); // array terminator
params.imatrix = i_data.data();
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
@ -603,7 +607,6 @@ int main(int argc, char ** argv) {
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
@ -611,7 +614,6 @@ int main(int argc, char ** argv) {
kvo.val_i64 = imatrix_data.size();
kv_overrides.emplace_back(std::move(kvo));
}
if (m_last_call > 0) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
@ -623,13 +625,19 @@ int main(int argc, char ** argv) {
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
params.kv_overrides = kv_overrides.data();
}
if (!tensor_type_opts.empty()) {
params.tensor_types = &tensor_type_opts;
t_override.reserve(tensor_type_opts.size() + 1);
for (const auto & tt : tensor_type_opts) {
t_override.push_back({tt.name.c_str(), tt.type});
}
t_override.push_back({nullptr, GGML_TYPE_COUNT}); // array terminator
params.tensor_types = t_override.data();
}
if (!prune_layers.empty()) {
params.prune_layers = &prune_layers;
prune_layers.push_back(-1); // array terminator
params.prune_layers = prune_layers.data();
}
llama_backend_init();