initial commit for branch

This commit is contained in:
ddh0 2026-02-12 17:52:59 -06:00
parent 0301b1db65
commit 5d6c92440c
2 changed files with 226 additions and 186 deletions

View File

@ -479,7 +479,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
return new_size;
}
static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) {
// based on this tensor and the destination tensor type, do we require an importance matrix?
static bool tensor_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) {
return (
dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S ||
@ -490,6 +491,151 @@ static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type
);
}
// do we allow this tensor to be quantized?
static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
const std::string name = tensor->name;
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
// do not quantize norm tensors
quantize &= name.find("_norm.weight") == std::string::npos;
quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= !params->only_copy;
// do not quantize expert gating tensors
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
// these are very small (e.g. 4x4)
quantize &= name.find("altup") == std::string::npos;
quantize &= name.find("laurel") == std::string::npos;
// these are not too big so keep them as it is
quantize &= name.find("per_layer_model_proj") == std::string::npos;
// do not quantize positional embeddings and token types (BERT)
quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
// do not quantize Mamba /Kimi's small conv1d weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d") == std::string::npos;
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
// do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
// do not quantize specific multimodal tensors
quantize &= name.find(".position_embd.") == std::string::npos;
return quantize;
}
static ggml_type get_tensor_target_type(
quantize_state_impl & qs,
const llama_model_quantize_params * params,
const ggml_tensor * tensor,
ggml_type default_type
) {
ggml_type new_type;
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
// if the user provided tensor types - use those
bool manual = false;
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
manual = true;
break;
}
}
}
}
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
{
bool convert_incompatible_tensor = false;
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
}
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
new_type = params->output_tensor_type;
}
return new_type;
}
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
@ -628,8 +774,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
int blk_id = 0;
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
tensors.reserve(ml.weights_map.size());
std::vector<const llama_model_loader::llama_tensor_weight *> weights;
weights.reserve(ml.weights_map.size());
for (const auto & it : ml.weights_map) {
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
if (remapped_name.empty()) {
@ -641,8 +787,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ggml_set_name(it.second.tensor, remapped_name.c_str());
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
}
tensors.push_back(&it.second);
weights.push_back(&it.second);
}
// make a list of tensors (same pointers as from weights)
std::vector<ggml_tensor*> tensors;
tensors.reserve(weights.size());
for (size_t i = 0; i < weights.size(); ++i) {
tensors.push_back(weights[i]->tensor);
}
if (!prune_list.empty()) {
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
}
@ -657,26 +811,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_qkv.weight") != std::string::npos ||
name.find("attn_kv_b.weight")!= std::string::npos) {
++qs.n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<std::thread> workers;
workers.reserve(nthread);
@ -690,23 +824,61 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// Assume split index is continuous
if (params->keep_split) {
for (const auto * it : tensors) {
for (const auto * it : weights) {
n_split = std::max(uint16_t(it->idx + 1), n_split);
}
}
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
// populate the original tensors so we get an initial meta data
for (const auto * it : tensors) {
// flag for `--dry-run`, to let the user know if imatrix will be required for a real
// quantization, as a courtesy
bool will_require_imatrix = false;
// this is the preliminary iteration over all weights (not the main loop)
for (const auto * it : weights) {
const ggml_tensor * tensor = it->tensor;
const std::string name = tensor->name;
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_qkv.weight") != std::string::npos ||
name.find("attn_kv_b.weight")!= std::string::npos) {
++qs.n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
// populate the original tensors so we get an initial meta data
uint16_t i_split = params->keep_split ? it->idx : 0;
ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
// TODO: we could save this per-tensor and correlate it with the vector of tensors so we
// don't have to call this function again later (currently twice per tensor)
ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type);
if (!params->imatrix &&
tensor_allows_quantization(params, model.arch, tensor) &&
tensor_requires_imatrix(tensor, target_type)
) {
if (params->dry_run) {
will_require_imatrix = true; // set flag for warning later, but continue with dry run
} else {
LLAMA_LOG_ERROR("\n\n============================================================================\n"
" ERROR: this quantization requires an importance matrix!\n"
" offending tensor: %s (target type: %s)\n"
"============================================================================\n\n",
name, ggml_type_name(target_type));
throw new std::runtime_error("this quantization requires an imatrix!");
}
}
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// Set split info if needed
if (n_split > 1) {
for (size_t i = 0; i < ctx_outs.size(); ++i) {
@ -752,13 +924,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_ofstream(0);
}
// flag for `--dry-run`, to let the user know if imatrix will be required for a real
// quantization, as a courtesy
bool will_require_imatrix = false;
size_t total_size_org = 0;
size_t total_size_new = 0;
for (const auto * it : tensors) {
// iterate over all weights (main loop)
for (const auto * it : weights) {
const auto & weight = *it;
ggml_tensor * tensor = weight.tensor;
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
@ -778,161 +951,40 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// will we quantize this tensor?
bool do_quantize = tensor_allows_quantization(params, model.arch, tensor);
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
ggml_type new_type = default_type;
// do not quantize norm tensors
quantize &= name.find("_norm.weight") == std::string::npos;
// if so, what will be the target type?
if (do_quantize) {
new_type = get_tensor_target_type(qs, params, tensor, default_type);
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
do_quantize = tensor->type != new_type;
}
quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= !params->only_copy;
// do not quantize expert gating tensors
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
// these are very small (e.g. 4x4)
quantize &= name.find("altup") == std::string::npos;
quantize &= name.find("laurel") == std::string::npos;
// these are not too big so keep them as it is
quantize &= name.find("per_layer_model_proj") == std::string::npos;
// do not quantize positional embeddings and token types (BERT)
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
// do not quantize Mamba /Kimi's small conv1d weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d") == std::string::npos;
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
// do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
// do not quantize specific multimodal tensors
quantize &= name.find(".position_embd.") == std::string::npos;
ggml_type new_type;
void * new_data;
size_t new_size;
if (quantize) {
new_type = default_type;
//
// perform quantization (or dry run)
//
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
// if the user provided tensor types - use those
bool manual = false;
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
manual = true;
break;
}
}
}
}
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
{
bool convert_incompatible_tensor = false;
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
}
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
new_type = params->output_tensor_type;
}
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
}
// we have now decided on the target type for this tensor
if (params->dry_run) {
// the --dry-run option calculates the final quantization size without quantizting
if (quantize) {
if (do_quantize) {
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
tensor_size/1024.0/1024.0,
new_size/1024.0/1024.0,
ggml_type_name(new_type));
if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type)) {
if (!will_require_imatrix && tensor_requires_imatrix(tensor, new_type)) {
will_require_imatrix = true;
}
} else {
@ -944,7 +996,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
continue;
} else {
// no --dry-run, perform quantization
if (!quantize) {
if (!do_quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = tensor_size;
@ -975,7 +1027,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
}
}
if (!imatrix && tensor_type_requires_imatrix(tensor, new_type)) {
if (!imatrix && tensor_requires_imatrix(tensor, new_type)) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");

View File

@ -686,18 +686,6 @@ int main(int argc, char ** argv) {
}
}
if (!params.dry_run &&
(
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M
) && imatrix_data.empty()) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "==========================================================================================================\n\n\n");
return 1;
}
if (!params.dry_run) {
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());