// llama.cpp/src/llama-quant.cpp
#include "ggml.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#include "llama.h"
#include "llama-ext.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>
// Q3_PT levels functions (defined in ggml-quants.c)
extern "C" {
void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[8]);
void q3pt_set_levels(const float * levels);
}
// Q3_KPT levels functions (defined in ggml-quants.c)
extern "C" {
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[8]);
void q3kpt_set_levels(const float * levels);
}
// Q4_DPT levels functions (defined in ggml-quants.c)
extern "C" {
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, int8_t levels_out[16]);
void q4dpt_set_levels(const int8_t * levels);
}
// Q2_KPT levels are handled internally by quantize_q2_kpt
#define Q2KPT_N_LEVELS 4
#define QK_K 256
extern "C" const float * q2kpt_get_levels(void);
extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
extern "C" void q2kpt_free_levels(void);
// IQ2_TQ functions - per-tensor trained grid
extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
extern "C" void iq2tq_set_grid(const int8_t grid[64]);
extern "C" const int8_t * iq2tq_get_grid(void);
// IQ3_TQ functions - per-tensor trained grid (3-bit, 128 bytes per tensor)
extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
extern "C" void iq3tq_set_grid(const int8_t grid[128]);
extern "C" const int8_t * iq3tq_get_grid(void);
// IQ1_BN functions - 8D vector quantization with a per-tensor trained 4096-entry codebook (32768 bytes per tensor)
extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
extern "C" const int8_t * iq1bn_get_aux(void);
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
std::string name;
ggml_type type = GGML_TYPE_COUNT;
};
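// hypothetical example, for illustration only (the exact CLI syntax lives in
// tools/quantize/quantize.cpp): an argument such as --tensor-type "ffn_down=q6_k"
// would be parsed into { name = "ffn_down", type = GGML_TYPE_Q6_K }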
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
TOKEN_EMBD,
ATTENTION_Q,
ATTENTION_V,
ATTENTION_K,
ATTENTION_QKV,
ATTENTION_KV_B,
ATTENTION_OUTPUT,
FFN_UP,
FFN_GATE,
FFN_DOWN,
OUTPUT,
OTHER
};
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
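// remap_layer: renumber "blk.N." tensor names when layers are pruned so that the
// surviving blocks stay contiguous. Pruned blocks map to "" (callers skip them).
// Example: pruning layer 2 keeps blk.0 and blk.1 as-is, then renames
// blk.3 -> blk.2, blk.4 -> blk.3, and so on.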
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
if (prune.empty()) {
return orig_name;
}
static const std::regex pattern(R"(blk\.(\d+)\.)");
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
const int blk = std::stoi(match[1]);
std::string new_name = orig_name;
if (mapped.count(blk)) {
// Already mapped, do nothing
} else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
mapped[blk] = "";
} else if (blk < prune.front()) {
mapped[blk] = std::to_string(blk);
next_id = blk + 1;
} else {
mapped[blk] = std::to_string(next_id);
++next_id;
}
return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
}
return orig_name;
}
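// remap_imatrix: the inverse of remap_layer - recover the original block index
// for a renumbered tensor name, since imatrix entries are keyed by the original
// (pre-pruning) tensor names.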
static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
if (mapped.empty()) {
return orig_name;
}
static const std::regex pattern(R"(blk\.(\d+)\.)");
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
const std::string blk(match[1]);
std::string new_name = orig_name;
for (const auto & p : mapped) {
if (p.second == blk) {
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
}
}
GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
}
return orig_name;
}
//
// helper functions for tensor name matching
//
static bool tensor_name_match_token_embd(const char * tensor_name) {
return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
}
static bool tensor_name_match_output_weight(const char * tensor_name) {
return std::strcmp(tensor_name, "output.weight") == 0;
}
//
// tensor categorization for quantization
//
// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
//
static tensor_category tensor_get_category(const std::string & tensor_name) {
if (tensor_name_match_output_weight(tensor_name.c_str())) {
return tensor_category::OUTPUT;
}
if (tensor_name_match_token_embd(tensor_name.c_str())) {
return tensor_category::TOKEN_EMBD;
}
if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
return tensor_category::ATTENTION_QKV;
}
if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
return tensor_category::ATTENTION_KV_B;
}
if (tensor_name.find("attn_v.weight") != std::string::npos) {
return tensor_category::ATTENTION_V;
}
if (tensor_name.find("attn_k.weight") != std::string::npos) {
return tensor_category::ATTENTION_K;
}
if (tensor_name.find("attn_q.weight") != std::string::npos) {
return tensor_category::ATTENTION_Q;
}
if (tensor_name.find("attn_output.weight") != std::string::npos) {
return tensor_category::ATTENTION_OUTPUT;
}
if (tensor_name.find("ffn_up") != std::string::npos) {
return tensor_category::FFN_UP;
}
if (tensor_name.find("ffn_gate") != std::string::npos) {
return tensor_category::FFN_GATE;
}
if (tensor_name.find("ffn_down") != std::string::npos) {
return tensor_category::FFN_DOWN;
}
return tensor_category::OTHER;
}
// check if category is for attention-v-like tensors (more sensitive to quantization)
static bool category_is_attn_v(tensor_category cat) {
return cat == tensor_category::ATTENTION_V ||
cat == tensor_category::ATTENTION_QKV ||
cat == tensor_category::ATTENTION_KV_B;
}
//
// quantization state
//
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int i_attention_wv = 0;
int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_fallback = 0;
bool has_imatrix = false;
// used to figure out if a model has tied embeddings (tok_embd shares weights with output)
bool has_tied_embeddings = true; // assume tied until we see output.weight
// tensor type override patterns (compiled once, used twice)
std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
model(model), params(params)
{
// compile regex patterns once - they are expensive
if (params->tt_overrides) {
for (const auto * p = params->tt_overrides; p->pattern != nullptr; p++) {
tensor_type_patterns.emplace_back(std::regex(p->pattern), p->type);
}
}
}
};
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
std::string name;
ggml_type target_type;
tensor_category category;
std::string remapped_imatrix_name;
bool allows_quantization;
bool requires_imatrix;
};
//
// dequantization
//
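// the multi-threaded path below splits the tensor into nthread contiguous
// ranges of quantization blocks; the last thread also takes the remainder.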
static void llama_tensor_dequantize_impl(
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
if (output.size() < nelements) {
output.resize(nelements);
}
float * f32_output = (float *) output.data();
const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
if (ggml_is_quantized(tensor->type)) {
if (qtype->to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16 &&
tensor->type != GGML_TYPE_BF16) {
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
}
if (nthread < 2) {
if (tensor->type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels);
} else {
GGML_ABORT("fatal error"); // unreachable
}
return;
}
size_t block_size;
if (tensor->type == GGML_TYPE_F16 ||
tensor->type == GGML_TYPE_BF16) {
block_size = 1;
} else {
block_size = (size_t)ggml_blck_size(tensor->type);
}
size_t block_size_bytes = ggml_type_size(tensor->type);
GGML_ASSERT(nelements % block_size == 0);
size_t nblocks = nelements / block_size;
size_t blocks_per_thread = nblocks / nthread;
size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
size_t in_buff_offs = 0;
size_t out_buff_offs = 0;
for (int tnum = 0; tnum < nthread; tnum++) {
size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
const void * quant_levels = tensor->quant_levels;
auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
qtype->to_float(inbuf, outbuf, nels, quant_levels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
in_buff_offs += thr_block_bytes;
out_buff_offs += thr_elems;
}
for (auto & w : workers) { w.join(); }
workers.clear();
}
//
// do we allow this tensor to be quantized?
//
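// e.g. "blk.0.ffn_down.weight" passes all checks below, while
// "blk.0.attn_norm.weight" (a norm) and "blk.0.ffn_gate_inp.weight"
// (expert gating) are kept in their original type.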
static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
// trivial checks first -- no string ops needed
if (params->only_copy) return false;
// quantize only 2D and 3D tensors (experts)
if (ggml_n_dims(tensor) < 2) return false;
const std::string name = ggml_get_name(tensor);
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
// do not quantize norm tensors
quantize &= name.find("_norm.weight") == std::string::npos;
quantize &= params->quantize_output_tensor || name != "output.weight";
// do not quantize expert gating tensors
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
// these are very small (e.g. 4x4)
quantize &= name.find("altup") == std::string::npos;
quantize &= name.find("laurel") == std::string::npos;
// these are not too big, so keep them as they are
quantize &= name.find("per_layer_model_proj") == std::string::npos;
// do not quantize positional embeddings and token types (BERT)
quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
// do not quantize Mamba/Kimi's small conv1d weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d") == std::string::npos;
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
// do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
// do not quantize specific multimodal tensors
quantize &= name.find(".position_embd") == std::string::npos;
quantize &= name.find("sam.pos_embd") == std::string::npos;
quantize &= name.find("sam.neck.") == std::string::npos;
quantize &= name.find("sam.net_") == std::string::npos;
quantize &= name.find(".rel_pos") == std::string::npos;
quantize &= name.find(".patch_embd") == std::string::npos;
quantize &= name.find(".patch_merger") == std::string::npos;
return quantize;
}
//
// tensor type selection
//
// incompatible tensor shapes are handled here - fallback to a compatible type
static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
ggml_type return_type = target_type;
const int64_t ncols = t->ne[0];
const int64_t qk_k = ggml_blck_size(target_type);
if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
t->name, ncols, qk_k, ggml_type_name(target_type));
++qs.n_fallback;
switch (target_type) {
// types on the left: block size 256
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S: // types on the right: block size 32
case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
default:
throw std::runtime_error(format("no tensor type fallback is defined for type %s",
ggml_type_name(target_type)));
}
if (ncols % ggml_blck_size(return_type) != 0) {
//
// the fallback return type is still not compatible for this tensor!
//
// most likely, this tensor's first dimension is not divisible by 32.
// this is very rare. we can either abort the quantization, or
// fallback to F16 / F32.
//
LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
return_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
}
return return_type;
}
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const llm_arch arch = qs.model.arch;
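// heuristic: spend extra bits on the first and last eighth of the layers,
// plus every third layer in between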
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
};
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
if (n_expert > 1) {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name.
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
}
if (i_layer < 0 || i_layer >= n_layer) {
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
}
}
return std::make_pair(i_layer, n_layer);
};
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
const int64_t nx = tensor->ne[0];
const int64_t qk_k = ggml_blck_size(new_type);
if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
new_type = GGML_TYPE_Q8_0;
}
else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
// MoE tensors -> MXFP4
// other tensors -> Q8_0
if (tensor->ne[2] > 1) {
new_type = GGML_TYPE_MXFP4;
} else {
new_type = GGML_TYPE_Q8_0;
}
} else if (category == tensor_category::TOKEN_EMBD) {
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs.params->token_embedding_type;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
new_type = GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
if (category_is_attn_v(category)) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
new_type = GGML_TYPE_Q4_K;
}
else if (category == tensor_category::FFN_DOWN) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
else if (category == tensor_category::ATTENTION_OUTPUT) {
if (qs.model.hparams.n_expert == 8) {
new_type = GGML_TYPE_Q5_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
}
}
} else if (category_is_attn_v(category)) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_Q3_PT;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
if (qs.model.type == LLM_TYPE_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
++qs.i_attention_wv;
} else if (category == tensor_category::ATTENTION_K) {
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
} else if (category == tensor_category::ATTENTION_Q) {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
} else if (category == tensor_category::FFN_DOWN) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K);
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
new_type = GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
&& qs.has_imatrix && i_layer < n_layer/8) {
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_ffn_down;
} else if (category == tensor_category::ATTENTION_OUTPUT) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ||
ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q5_K;
}
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
}
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
}
}
else if (category == tensor_category::ATTENTION_QKV) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (category == tensor_category::FFN_GATE) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
}
++qs.i_ffn_gate;
}
else if (category == tensor_category::FFN_UP) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
}
++qs.i_ffn_up;
}
return new_type;
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
return tensor->type;
}
if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
return params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
return params->output_tensor_type;
}
ggml_type new_type = default_type;
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
// if the user provided tensor types - use those
bool manual = false;
if (!qs.tensor_type_patterns.empty()) {
const std::string tensor_name(tensor->name);
for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
if (std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
new_type = qtype;
manual = true;
break;
}
}
}
}
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
new_type = tensor_type_fallback(qs, tensor, new_type);
}
return new_type;
}
//
// quantization implementation
//
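// rows are handed out to worker threads in chunks of (chunk_size / n_per_row)
// rows via a mutex-guarded counter; each worker validates the rows it wrote.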
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
if (!ggml_validate_row_data(new_type, new_data, new_size)) {
throw std::runtime_error("quantized data validation failed");
}
return new_size;
}
std::mutex mutex;
int64_t counter = 0;
size_t new_size = 0;
bool valid = true;
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
const int64_t nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int64_t first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
new_size += local_size;
}
break;
}
lock.unlock();
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
local_size += this_size;
// validate the quantized data
const size_t row_size = ggml_row_size(new_type, n_per_row);
void * this_data = (char *) new_data + first_row * row_size;
if (!ggml_validate_row_data(new_type, this_data, this_size)) {
std::unique_lock<std::mutex> lock(mutex);
valid = false;
break;
}
}
};
for (int it = 0; it < nthread - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
if (!valid) {
throw std::runtime_error("quantized data validation failed");
}
return new_size;
}
//
// imatrix requirement check
//
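// the very-low-bit IQ types below rely on importance weighting to place their
// few representable levels; quantizing them without an imatrix gives poor
// results, so missing data is treated as a hard error rather than a warning.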
static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
return false;
}
switch (dst_type) {
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_S:
return true;
case GGML_TYPE_Q2_K:
// as a general rule, the k-type quantizations don't require imatrix data.
// the only exception is Q2_K tensors that are part of a Q2_K_S file.
return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
default:
return false;
}
}
//
// given a file type, get the default tensor type
//
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0;
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
case LLAMA_FTYPE_MOSTLY_IQ3_S:
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_PT: return GGML_TYPE_Q3_PT;
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return GGML_TYPE_Q3_KPT;
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return GGML_TYPE_Q4_DPT;
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return GGML_TYPE_Q2_KPT;
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return GGML_TYPE_IQ2_TQ;
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return GGML_TYPE_IQ3_TQ;
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return GGML_TYPE_IQ1_BN;
default: return GGML_TYPE_COUNT;
}
}
static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
for (auto & tm : metadata) {
tensor_category cat = tensor_get_category(tm.name);
tm.category = cat;
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
}
//
// main quantization driver
//
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
llama_ftype ftype = params->ftype;
int nthread = params->nthread;
if (nthread <= 0) {
nthread = std::thread::hardware_concurrency();
}
ggml_type default_type = llama_ftype_get_default_type(ftype);
if (default_type == GGML_TYPE_COUNT) {
throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif
const llama_model_kv_override * kv_overrides = params->kv_overrides;
std::vector<std::string> splits = {};
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
model.load_arch (ml);
model.load_hparams(ml);
model.load_stats (ml);
quantize_state_impl qs(model, params);
if (params->only_copy) {
ftype = ml.ftype;
}
std::unordered_map<std::string, std::vector<float>> i_data;
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
for (const llama_model_imatrix_data * p = params->imatrix; p->name != nullptr; p++) {
i_data.emplace(p->name, std::vector<float>(p->data, p->data + p->size));
}
imatrix_data = &i_data;
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
__func__, (int)imatrix_data->size());
qs.has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
for (float f : kv.second) {
if (!std::isfinite(f)) {
throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
}
}
}
}
const size_t align = GGUF_DEFAULT_ALIGNMENT;
gguf_context_ptr ctx_out { gguf_init_empty() };
std::vector<int> prune_list = {};
if (params->prune_layers) {
for (const int32_t * p = params->prune_layers; * p != -1; p++) {
prune_list.push_back(* p);
}
}
// copy the KV pairs from the input file
gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
// Remove split metadata
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
if (params->kv_overrides) {
for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) {
if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64);
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64));
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool);
} else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
gguf_set_val_str(ctx_out.get(), o->key, o->val_str);
} else {
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key);
}
}
}
std::map<int, std::string> mapped;
int blk_id = 0;
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
tensors.reserve(ml.weights_map.size());
for (const auto & it : ml.weights_map) {
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
if (remapped_name.empty()) {
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
continue;
}
if (remapped_name != it.first) {
ggml_set_name(it.second.tensor, remapped_name.c_str());
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
}
tensors.push_back(&it.second);
}
if (!prune_list.empty()) {
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
}
// keep_split requires that the weights are sorted by split index
if (params->keep_split) {
std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
if (a->idx == b->idx) {
return a->offs < b->offs;
}
return a->idx < b->idx;
});
}
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
metadata[i].name = ggml_get_name(tensors[i]->tensor);
}
// initialize quantization state counters and metadata categories
init_quantize_state_counters(qs, metadata);
int idx = 0;
uint16_t n_split = 1;
// Assume split indices are contiguous
if (params->keep_split) {
for (const auto * it : tensors) {
n_split = std::max(uint16_t(it->idx + 1), n_split);
}
}
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
// flag for --dry-run
bool will_require_imatrix = false;
//
// preliminary iteration over all weights
//
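// this pass registers every tensor with its output split, decides each
// tensor's target type once, and fails early (or flags --dry-run) if a
// required imatrix is missing - before any data has been written.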
for (size_t i = 0; i < tensors.size(); ++i) {
const auto * it = tensors[i];
const struct ggml_tensor * tensor = it->tensor;
uint16_t i_split = params->keep_split ? it->idx : 0;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
if (metadata[i].allows_quantization) {
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
} else {
metadata[i].target_type = tensor->type;
}
metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
if (params->imatrix) {
metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
} else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
if (params->dry_run) {
will_require_imatrix = true;
} else {
LLAMA_LOG_ERROR("\n============================================================================\n"
" ERROR: this quantization requires an importance matrix!\n"
" - offending tensor: %s\n"
" - target type: %s\n"
"============================================================================\n\n",
metadata[i].name.c_str(), ggml_type_name(metadata[i].target_type));
throw std::runtime_error("this quantization requires an imatrix!");
}
}
}
// Set split info if needed
if (n_split > 1) {
for (size_t i = 0; i < ctx_outs.size(); ++i) {
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
}
}
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<std::thread> workers;
workers.reserve(nthread);
std::vector<no_init<uint8_t>> read_data;
std::vector<no_init<uint8_t>> work;
std::vector<no_init<float>> f32_conv_buf;
int cur_split = -1;
std::ofstream fout;
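// output files are written in two steps: new_ofstream() reserves a zeroed
// placeholder for the GGUF metadata, tensor data is appended after it, and
// close_ofstream() seeks back to offset 0 to write the final metadata.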
auto close_ofstream = [&]() {
// Write metadata and close file handler
if (fout.is_open()) {
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
fout.write((const char *) data.data(), data.size());
fout.close();
}
};
auto new_ofstream = [&](int index) {
cur_split = index;
GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
std::string fname = fname_out;
if (params->keep_split) {
std::vector<char> split_path(llama_path_max(), 0);
llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
fname = std::string(split_path.data());
}
fout = std::ofstream(fname, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
// placeholder for the meta data
::zeros(fout, meta_size);
};
// Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output
// file, so the levels KV entry is already part of the metadata when the
// placeholder is sized and written.
static const size_t Q3PT_N_LEVELS = 8;
std::vector<float> q3pt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__);
q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f);
// Temporary read/dequant buffers for pass 1 (kept separate from the pass-2 buffers f32_conv_buf / read_data declared above)
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_PT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (new_type != GGML_TYPE_Q3_PT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3pt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3pt_all_levels.data() + ti * Q3PT_N_LEVELS);
}
// All levels are ready - store them in the GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32,
q3pt_all_levels.data(), q3pt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
}
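// (the main loop is expected to install these levels via q3pt_set_levels()
// before quantizing each Q3_PT tensor in pass 2)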
// Q3_KPT two-pass approach: train all per-tensor levels BEFORE opening the output file (same rationale as Q3_PT above)
static const size_t Q3KPT_N_LEVELS = 8;
std::vector<float> q3kpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_KPT pass 1: training per-tensor levels...\n", __func__);
q3kpt_all_levels.assign(tensors.size() * Q3KPT_N_LEVELS, 0.0f);
// Temporary dequant buffer for pass 1
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q3_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS);
}
// All levels are ready - store them in the GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32,
q3kpt_all_levels.data(), q3kpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__);
}
// Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the output
// file, so the levels KV entry is already part of the metadata when the
// placeholder is sized and written.
static const size_t Q4DPT_N_LEVELS = 16;
std::vector<int8_t> q4dpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__);
q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q4_DPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS);
}
// Store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8,
q4dpt_all_levels.data(), q4dpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__);
}
// Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output
// file, so the levels KV entries are already part of the metadata when the
// placeholder is sized and written.
// Per-block levels: 4 floats per 256-element block.
struct q2kpt_tensor_levels {
std::string name;
std::vector<float> levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats
};
std::vector<q2kpt_tensor_levels> q2kpt_all_levels;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q2_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
// Allocate levels buffer for this tensor
const int nb = n_per_row / QK_K;
const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS;
q2kpt_all_levels.push_back({tname, std::vector<float>(n_levels)});
LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n",
__func__, ti+1, tensors.size(), tensor->name, n_levels);
// Train levels by running the actual f32 -> Q2_KPT quantization; the trained
// levels are captured internally and read back via q2kpt_get_levels() below
std::vector<no_init<uint8_t>> p1_qbuf(ggml_nbytes(tensor));
const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row);
// Prepare levels buffer for this tensor
q2kpt_free_levels();
q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row);
// Quantize each expert slice
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
// start_row must be the absolute row index for correct levels indexing
ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03);
}
// Copy trained levels to our storage
const float * trained_levels = q2kpt_get_levels();
if (trained_levels) {
memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float));
}
}
// Store all levels in GGUF metadata before the file is opened
for (const auto & tl : q2kpt_all_levels) {
for (auto & ctx : ctx_outs) {
if (ctx) {
const std::string key = tl.name + ".q2kpt_levels";
gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32,
tl.levels.data(), tl.levels.size());
}
}
}
LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__);
}
// IQ2_TQ: train per-tensor grid in pass 1
struct iq2tq_meta {
std::string tensor_name;
int8_t grid[64];
};
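    // 64 int8 grid entries per tensor, serialized to GGUF under
    // "iq2tq.grid.<tensor name>" in pass 1 and replayed via iq2tq_set_grid()
    // before quantization in pass 2.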
std::vector<iq2tq_meta> iq2tq_all_meta;
    if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ2_TQ && !params->dry_run) { // mirror the Q2_KPT dry-run guard
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Mirror pass-2 logic: only quantize 2D+ weight tensors
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ2_TQ) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ2_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq2tq_meta meta;
meta.tensor_name = tname;
iq2tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq2tq_all_meta.push_back(meta);
            // Store the grid in every split's GGUF metadata before the files are opened,
            // mirroring the Q2_KPT handling
            std::string grid_key = "iq2tq.grid." + tname;
            for (auto & ctx : ctx_outs) {
                if (ctx) {
                    gguf_set_arr_data(ctx.get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 64);
                }
            }
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq2tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ3_TQ: train per-tensor grid in pass 1 (16 entries × 8 levels = 128 bytes)
struct iq3tq_meta {
std::string tensor_name;
int8_t grid[128];
};
std::vector<iq3tq_meta> iq3tq_all_meta;
    if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ3_TQ && !params->dry_run) { // mirror the Q2_KPT dry-run guard
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ3_TQ) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ3_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq3tq_meta meta;
meta.tensor_name = tname;
iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq3tq_all_meta.push_back(meta);
            std::string grid_key = "iq3tq.grid." + tname;
            // store the grid in every split's GGUF metadata, mirroring the Q2_KPT handling
            for (auto & ctx : ctx_outs) {
                if (ctx) {
                    gguf_set_arr_data(ctx.get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 128);
                }
            }
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq3tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ1_BN: train per-tensor codebook in pass 1 (4096 × 8D centroids = 32768 bytes)
struct iq1bn_meta {
std::string tensor_name;
int8_t aux[32768];
};
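    // 4096 centroids x 8 int8 components = 32768 bytes per tensor, serialized
    // to GGUF under "iq1bn.aux.<tensor name>" and replayed via iq1bn_set_aux()
    // in pass 2.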
std::vector<iq1bn_meta> iq1bn_all_meta;
    if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN && !params->dry_run) { // mirror the Q2_KPT dry-run guard
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1: training per-tensor codebooks...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ1_BN) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ1_BN codebook for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq1bn_meta meta;
meta.tensor_name = tname;
iq1bn_train_codebook(f32_data, nrows, n_per_row, imatrix, meta.aux, nthread);
iq1bn_all_meta.push_back(meta);
            std::string aux_key = "iq1bn.aux." + tname;
            // store the codebook in every split's GGUF metadata, mirroring the Q2_KPT handling
            for (auto & ctx : ctx_outs) {
                if (ctx) {
                    gguf_set_arr_data(ctx.get(), aux_key.c_str(), GGUF_TYPE_INT8, meta.aux, 32768);
                }
            }
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq1bn_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
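    // At this point every per-tensor trained artifact (levels, grids, codebooks)
    // has been written into the output GGUF contexts, so the size of the KV
    // section is final before the placeholder header goes out.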
// no output file for --dry-run
if (!params->dry_run) {
new_ofstream(0);
}
//
// main loop: iterate over all weights
//
    size_t tensor_pass2_idx = 0; // index into tensors[], used for the Q3_PT/Q3_KPT/Q4_DPT per-tensor levels lookups
for (size_t i = 0; i < tensors.size(); ++i) {
const auto & weight = *tensors[i];
const auto & tm = metadata[i];
ggml_tensor * tensor = weight.tensor;
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
}
const size_t tensor_size = ggml_nbytes(tensor);
if (!params->dry_run) {
if (!ml.use_mmap) {
if (read_data.size() < tensor_size) {
read_data.resize(tensor_size);
}
tensor->data = read_data.data();
}
ml.load_data_for(tensor);
}
LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
const ggml_type cur_type = tensor->type;
const ggml_type new_type = tm.target_type;
        // If the target type matches the type the tensor is already stored in,
        // there is nothing to do.
bool quantize = cur_type != new_type;
void * new_data;
size_t new_size;
if (params->dry_run) {
// the --dry-run option calculates the final quantization size without quantizing
if (quantize) {
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
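                // e.g. a 4096x4096 F16 tensor (32 MiB) quantized to Q4_K:
                // ggml_row_size(Q4_K, 4096) = 16 blocks * 144 bytes = 2304 bytes,
                // so new_size = 4096 * 2304 bytes = 9 MiB.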
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
tensor_size/1024.0/1024.0,
new_size/1024.0/1024.0,
ggml_type_name(new_type));
if (!will_require_imatrix && tm.requires_imatrix) {
will_require_imatrix = true;
}
} else {
new_size = tensor_size;
LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
}
total_size_org += tensor_size;
total_size_new += new_size;
continue;
} else {
// no --dry-run, perform quantization
if (!quantize) {
new_data = tensor->data;
new_size = tensor_size;
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
} else {
const int64_t nelements = ggml_nelements(tensor);
const float * imatrix = nullptr;
if (imatrix_data) {
auto it = imatrix_data->find(tm.remapped_imatrix_name);
if (it == imatrix_data->end()) {
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
imatrix = it->second.data();
} else {
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
                        // this can happen when quantizing an old mixtral model with split tensors using a new, incompatible imatrix.
                        // this is a significant error, and it may be a good idea to abort the process when it happens,
                        // since many people would otherwise miss it and not realize that most of the model is being quantized without an imatrix.
                        // tok_embd is exempted, since it always triggers this warning.
if (!tensor_name_match_token_embd(tensor->name)) {
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
}
}
}
}
if (!imatrix && tm.requires_imatrix) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
LLAMA_LOG_ERROR("============================================================\n\n");
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
}
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
if (work.size() < (size_t)nelements * 4) {
work.resize(nelements * 4); // upper bound on size
}
new_data = work.data();
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
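                // e.g. n_per_row = 4096: chunk_size = 4096 * ((16384 + 4095) / 4096)
                // = 4096 * 4 = 16384, so each chunk spans whole rows and is at least
                // min_chunk_size elements; a single row of >= 16384 elements forms
                // its own chunk.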
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_PT) {
q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS);
}
// Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_KPT) {
q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS);
}
// Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q4_DPT) {
q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS);
}
// IQ2_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ2_TQ) {
bool found = false;
for (const auto & meta : iq2tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq2tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ2_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ3_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ3_TQ) {
bool found = false;
for (const auto & meta : iq3tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq3tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ3_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ1_BN: set per-tensor trained codebook
if (new_type == GGML_TYPE_IQ1_BN) {
bool found = false;
for (const auto & meta : iq1bn_all_meta) {
if (meta.tensor_name == tm.name) {
iq1bn_set_aux(meta.aux);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained codebook for IQ1_BN tensor %s\n", __func__, tm.name.c_str());
}
}
// Q2_KPT: quantize_q2_kpt trains per-block levels internally.
// Levels were already trained and saved to GGUF in pass 1.
// We still need to allocate the levels buffer for quantization to work correctly.
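                // (Assuming the quantizer is deterministic for identical input data
                // and imatrix, the levels recomputed here match the pass-1 values
                // already serialized to GGUF.)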
if (new_type == GGML_TYPE_Q2_KPT) {
const int64_t total_rows = nrows * tensor->ne[2];
q2kpt_free_levels(); // Clear any stale levels from previous tensor
q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor
}
// quantize each expert separately since they have different importance matrices
new_size = 0;
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
total_size_org += tensor_size;
total_size_new += new_size;
        // update the GGUF metadata as we go
gguf_set_tensor_type(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_type);
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), metadata[i].name.c_str())) == new_size);
gguf_set_tensor_data(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_data);
// write tensor data + padding
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
} // no --dry-run
tensor_pass2_idx++;
} // iterate over tensors
if (!params->dry_run) {
close_ofstream();
}
LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
if (!params->imatrix && params->dry_run && will_require_imatrix) {
LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
__func__
);
}
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback, ml.n_tensors);
}
}
//
// interface implementation
//
llama_model_quantize_params llama_model_quantize_default_params() {
llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.keep_split =*/ false,
/*.dry_run =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
/*.prune_layers =*/ nullptr
};
return result;
}
uint32_t llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params) {
try {
llama_model_quantize_impl(fname_inp, fname_out, params);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
return 1;
}
return 0;
}
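// Minimal usage sketch (illustrative only):
//
//   llama_model_quantize_params qp = llama_model_quantize_default_params();
//   qp.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_TQ;   // any supported ftype
//   qp.nthread = 8;
//   if (llama_model_quantize("in-f16.gguf", "out-iq2_tq.gguf", &qp) != 0) {
//       // quantization failed; details are in the log
//   }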
//
// Helper functions for external tools exposed in llama-ext.h
//
quantize_state_impl * llama_quant_init(
const llama_model * model,
const llama_model_quantize_params * params) {
return new quantize_state_impl(*model, params);
}
void llama_quant_free(quantize_state_impl * qs) {
delete qs;
}
llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
struct llama_model_params mparams = llama_model_default_params();
auto * model = new llama_model(mparams);
model->arch = llm_arch_from_string(desc->architecture);
// infer llm_type: only LLM_TYPE_70B matters for quantization logic
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
model->type = LLM_TYPE_70B;
}
model->hparams.n_embd = desc->n_embd;
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
model->hparams.n_layer = desc->n_layer;
model->hparams.n_expert = desc->n_expert;
for (uint32_t i = 0; i < desc->n_layer; i++) {
model->hparams.n_head_arr[i] = desc->n_head;
model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
model->hparams.n_ff_arr[i] = desc->n_ff;
}
return model;
}
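// Note: this populates only the hparams consulted by the type-selection logic;
// the returned model is a skeleton for the quantization helpers here, not a
// loadable inference model.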
bool llama_quant_tensor_allows_quantization(
const quantize_state_impl * qs,
const ggml_tensor * tensor) {
return tensor_allows_quantization(qs->params, qs->model.arch, tensor);
}
void llama_quant_compute_types(
quantize_state_impl * qs,
llama_ftype ftype,
ggml_tensor ** tensors,
ggml_type * result_types,
size_t n_tensors) {
// reset per-computation state
qs->n_attention_wv = 0;
qs->n_ffn_down = 0;
qs->n_ffn_gate = 0;
qs->n_ffn_up = 0;
qs->i_attention_wv = 0;
qs->i_ffn_down = 0;
qs->i_ffn_gate = 0;
qs->i_ffn_up = 0;
qs->n_fallback = 0;
qs->has_imatrix = false;
qs->has_tied_embeddings = true;
// build metadata from tensor names
std::vector<tensor_metadata> metadata(n_tensors);
for (size_t i = 0; i < n_tensors; i++) {
metadata[i].name = ggml_get_name(tensors[i]);
}
// initialize counters and categories
init_quantize_state_counters(*qs, metadata);
// use a local copy of params with the requested ftype
llama_model_quantize_params local_params = *qs->params;
local_params.ftype = ftype;
ggml_type default_type = llama_ftype_get_default_type(ftype);
// compute types
for (size_t i = 0; i < n_tensors; i++) {
result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]);
}
}
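// Typical call sequence for these llama-ext.h helpers (illustrative only):
//
//   quantize_state_impl * qs = llama_quant_init(model, &params);
//   std::vector<ggml_type> types(n_tensors);
//   llama_quant_compute_types(qs, LLAMA_FTYPE_MOSTLY_Q4_K_M, tensors, types.data(), n_tensors);
//   llama_quant_free(qs);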