Refactor llama_model_quantize_params to expose a pure C interface
This commit is contained in:
parent
66d65ec29b
commit
04ee7d02d8
|
|
@ -378,22 +378,33 @@ extern "C" {
|
||||||
size_t n_samplers;
|
size_t n_samplers;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Per-tensor quantization type override.
// Passed to quantization as an array (llama_model_quantize_params::tensor_types);
// the array is terminated by an entry whose pattern is NULL.
typedef struct llama_model_tensor_override {
|
||||||
|
const char * pattern; // regular expression searched for in the tensor name; NULL marks the end of the array
|
||||||
|
enum ggml_type type; // type used for tensors whose name matches pattern
|
||||||
|
} llama_model_tensor_override;
|
||||||
|
|
||||||
|
// One named importance-matrix entry.
// Passed to quantization as an array (llama_model_quantize_params::imatrix);
// the array is terminated by an entry whose name is NULL.
// The quantizer copies the values of each entry into its own storage.
typedef struct llama_imatrix_data {
|
||||||
|
const char * name; // entry name used as the lookup key; NULL marks the end of the array
|
||||||
|
const float * data; // pointer to the first of size float values
|
||||||
|
size_t size; // number of float values in data (element count, not bytes)
|
||||||
|
} llama_imatrix_data;
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
typedef struct llama_model_quantize_params {
|
typedef struct llama_model_quantize_params {
|
||||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
enum ggml_type output_tensor_type; // output tensor type
|
enum ggml_type output_tensor_type; // output tensor type
|
||||||
enum ggml_type token_embedding_type; // token embeddings tensor type
|
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
bool pure; // quantize all tensors to the default type
|
bool pure; // quantize all tensors to the default type
|
||||||
bool keep_split; // quantize to the same number of shards
|
bool keep_split; // quantize to the same number of shards
|
||||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||||
void * imatrix; // pointer to importance matrix data
|
const struct llama_imatrix_data * imatrix; // pointer to importance matrix data: array terminated by an entry with name == NULL (may be NULL if unused)
|
||||||
void * kv_overrides; // pointer to vector containing overrides
|
const struct llama_model_kv_override * kv_overrides; // pointer to kv overrides: array terminated by an entry with key[0] == 0 (may be NULL if unused)
|
||||||
void * tensor_types; // pointer to vector containing tensor types
|
const struct llama_model_tensor_override * tensor_types; // pointer to tensor overrides: array terminated by an entry with pattern == NULL (may be NULL if unused)
|
||||||
void * prune_layers; // pointer to vector containing layer indices to prune
|
const int32_t * prune_layers; // pointer to layer indices to prune: array terminated by -1 (may be NULL if unused)
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
typedef struct llama_logit_bias {
|
typedef struct llama_logit_bias {
|
||||||
|
|
|
||||||
|
|
@ -549,12 +549,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
constexpr bool use_mmap = false;
|
constexpr bool use_mmap = false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
llama_model_kv_override * kv_overrides = nullptr;
|
const llama_model_kv_override * kv_overrides = params->kv_overrides;
|
||||||
if (params->kv_overrides) {
|
|
||||||
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
|
||||||
kv_overrides = v->data();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||||
ml.init_mappings(false); // no prefetching
|
ml.init_mappings(false); // no prefetching
|
||||||
|
|
@ -570,9 +565,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
if (params->only_copy) {
|
if (params->only_copy) {
|
||||||
ftype = ml.ftype;
|
ftype = ml.ftype;
|
||||||
}
|
}
|
||||||
|
std::unordered_map<std::string, std::vector<float>> i_data;
|
||||||
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
||||||
if (params->imatrix) {
|
if (params->imatrix) {
|
||||||
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
for (const llama_imatrix_data * p = params->imatrix; p->name != nullptr; p++) {
|
||||||
|
i_data.emplace(p->name, std::vector<float>(p->data, p->data + p->size));
|
||||||
|
}
|
||||||
|
imatrix_data = & i_data;
|
||||||
if (imatrix_data) {
|
if (imatrix_data) {
|
||||||
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
||||||
qs.has_imatrix = true;
|
qs.has_imatrix = true;
|
||||||
|
|
@ -592,7 +591,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
|
|
||||||
std::vector<int> prune_list = {};
|
std::vector<int> prune_list = {};
|
||||||
if (params->prune_layers) {
|
if (params->prune_layers) {
|
||||||
prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
|
for (const int32_t * p = params->prune_layers; * p != -1; p++) {
|
||||||
|
prune_list.push_back(* p);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy the KV pairs from the input file
|
// copy the KV pairs from the input file
|
||||||
|
|
@ -606,20 +607,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
||||||
|
|
||||||
if (params->kv_overrides) {
|
if (params->kv_overrides) {
|
||||||
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) {
|
||||||
for (const auto & o : overrides) {
|
if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
||||||
if (o.key[0] == 0) break;
|
gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64);
|
||||||
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
||||||
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
|
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
|
||||||
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
||||||
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
|
gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64));
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
||||||
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
|
gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool);
|
||||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
||||||
gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
|
gguf_set_val_str(ctx_out.get(), o->key, o->val_str);
|
||||||
} else {
|
} else {
|
||||||
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -850,13 +849,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
// if the user provided tensor types - use those
|
// if the user provided tensor types - use those
|
||||||
bool manual = false;
|
bool manual = false;
|
||||||
if (params->tensor_types) {
|
if (params->tensor_types) {
|
||||||
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
|
|
||||||
const std::string tensor_name(tensor->name);
|
const std::string tensor_name(tensor->name);
|
||||||
for (const auto & [tname, qtype] : tensor_types) {
|
for (const auto * p = params->tensor_types; p->pattern != nullptr; p++) {
|
||||||
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
if (std::regex pattern(p->pattern); std::regex_search(tensor_name, pattern)) {
|
||||||
if (qtype != new_type) {
|
if (p->type != new_type) {
|
||||||
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
|
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(p->type));
|
||||||
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
|
new_type = p->type; // if two or more types are specified for the same tensor, the last match wins
|
||||||
manual = true;
|
manual = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -577,8 +577,16 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<std::string> imatrix_datasets;
|
std::vector<std::string> imatrix_datasets;
|
||||||
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
|
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
|
||||||
|
|
||||||
|
std::vector<llama_imatrix_data> i_data;
|
||||||
|
std::vector<llama_model_tensor_override> t_override;
|
||||||
if (!imatrix_data.empty()) {
|
if (!imatrix_data.empty()) {
|
||||||
params.imatrix = &imatrix_data;
|
i_data.reserve(imatrix_data.size() + 1);
|
||||||
|
for (const auto & kv : imatrix_data) {
|
||||||
|
i_data.push_back({kv.first.c_str(), kv.second.data(), kv.second.size()});
|
||||||
|
}
|
||||||
|
i_data.push_back({nullptr, nullptr, 0}); // array terminator
|
||||||
|
params.imatrix = i_data.data();
|
||||||
{
|
{
|
||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||||
|
|
@ -596,7 +604,6 @@ int main(int argc, char ** argv) {
|
||||||
kvo.val_str[127] = '\0';
|
kvo.val_str[127] = '\0';
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||||
|
|
@ -604,7 +611,6 @@ int main(int argc, char ** argv) {
|
||||||
kvo.val_i64 = imatrix_data.size();
|
kvo.val_i64 = imatrix_data.size();
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_last_call > 0) {
|
if (m_last_call > 0) {
|
||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
||||||
|
|
@ -616,13 +622,19 @@ int main(int argc, char ** argv) {
|
||||||
if (!kv_overrides.empty()) {
|
if (!kv_overrides.empty()) {
|
||||||
kv_overrides.emplace_back();
|
kv_overrides.emplace_back();
|
||||||
kv_overrides.back().key[0] = 0;
|
kv_overrides.back().key[0] = 0;
|
||||||
params.kv_overrides = &kv_overrides;
|
params.kv_overrides = kv_overrides.data();
|
||||||
}
|
}
|
||||||
if (!tensor_types.empty()) {
|
if (!tensor_types.empty()) {
|
||||||
params.tensor_types = &tensor_types;
|
t_override.reserve(tensor_types.size() + 1);
|
||||||
|
for (const auto & tt : tensor_types) {
|
||||||
|
t_override.push_back({tt.name.c_str(), tt.quant});
|
||||||
|
}
|
||||||
|
t_override.push_back({nullptr, GGML_TYPE_COUNT}); // array terminator
|
||||||
|
params.tensor_types = t_override.data();
|
||||||
}
|
}
|
||||||
if (!prune_layers.empty()) {
|
if (!prune_layers.empty()) {
|
||||||
params.prune_layers = &prune_layers;
|
prune_layers.push_back(-1); // array terminator
|
||||||
|
params.prune_layers = prune_layers.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue