Update with latest changes to state counters

This commit is contained in:
Colin Kealty 2026-03-11 13:24:51 -04:00
parent 506a49006c
commit 3948227d23
18 changed files with 11748 additions and 2135 deletions

View File

@ -1 +1,36 @@
IMPORTANT: Ensure youve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
IMPORTANT: Ensure you've thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
# AI Policy
- AI is assistive only; AI-generated PRs are restricted per AGENTS.md
- Contributor reviews and writes code themselves
# Code Style & Conventions
- snake_case naming; optimize for longest common prefix
- 4 spaces indentation, brackets on same line
- `void * ptr`, `int & a` (space around pointer/reference)
- Avoid templates and fancy STL
- Use sized integer types (`int32_t`) in public API
- See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines, naming, and PR process
# ggml Tensor Conventions
- Data stored in row-major order
- Dimension 0 = columns, dimension 1 = rows, dimension 2 = matrices
- **Matrix multiply is unconventional**: `C = ggml_mul_mat(ctx, A, B)` means `C^T = A * B^T`
# Quantization
- See [docs/quantization/](docs/quantization/) for comprehensive documentation
- See [docs/quantization/09-adding-new-types.md](docs/quantization/09-adding-new-types.md) for adding new types
## Key Files
- `ggml/include/ggml.h`: type enums (`ggml_type`)
- `ggml/src/ggml-common.h`: block structures
- `ggml/src/ggml-quants.c`: reference quantize/dequantize implementations
- `tools/quantize/quantize.cpp`: CLI tool
- `src/llama-quant.cpp`: core quantization engine
## Quantization Families
- **Q**: simple uniform quantization
- **K**: super-block quantization (multiple sub-blocks per super-block)
- **IQ**: importance-weighted quantization
- **T**: ternary quantization
- **MXFP**: microscaling (MX) floating-point quantization (OCP MX Formats)

View File

@ -13,22 +13,6 @@
#include <thread>
#include <unordered_map>
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
TOKEN_EMBD,
ATTENTION_Q,
ATTENTION_V,
ATTENTION_K,
ATTENTION_QKV,
ATTENTION_KV_B,
ATTENTION_OUTPUT,
FFN_UP,
FFN_GATE,
FFN_DOWN,
OUTPUT,
OTHER
};
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
@ -150,15 +134,6 @@ static bool category_is_attn_v(tensor_category cat) {
cat == tensor_category::ATTENTION_KV_B;
}
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
ggml_type target_type;
tensor_category category;
std::string remapped_imatrix_name;
bool allows_quantization;
bool requires_imatrix;
};
//
// dequantization
//
@ -355,6 +330,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
return return_type;
}
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
@ -604,15 +580,8 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
return new_type;
}
// public API: compute category from tensor name and delegate to _impl
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
tensor_category category = tensor_get_category(name);
return llama_tensor_get_type_impl(qs, new_type, tensor, ftype, category);
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to (used internally by llama_model_quantize_impl)
static ggml_type llama_tensor_get_type_internal(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
// outer wrapper: determine the ggml_type that this tensor should be quantized to
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
return tensor->type;
}
@ -845,15 +814,16 @@ const char * llama_ftype_to_name(llama_ftype ftype) {
return nullptr;
}
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names) {
for (const auto & name : tensor_names) {
tensor_category cat = tensor_get_category(name);
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
for (auto & tm : metadata) {
tensor_category cat = tensor_get_category(tm.name);
tm.category = cat;
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (tensor_name_match_output_weight(name.c_str())) {
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
}
@ -996,15 +966,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
{
std::vector<std::string> tensor_names;
tensor_names.reserve(tensors.size());
for (const auto * it : tensors) {
tensor_names.emplace_back(ggml_get_name(it->tensor));
}
init_quantize_state_counters(qs, tensor_names);
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
metadata[i].name = ggml_get_name(tensors[i]->tensor);
}
// initialize quantization state counters and metadata categories
init_quantize_state_counters(qs, metadata);
int idx = 0;
uint16_t n_split = 1;
@ -1017,25 +987,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
// initialize quantization state before preliminary loop (counters for use_more_bits)
{
for (size_t i = 0; i < tensors.size(); ++i) {
const auto cat = tensor_get_category(tensors[i]->tensor->name);
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
metadata[i].category = cat; // save and re-use the category while we're at it
}
// these also need to be set to n_layer by default
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
}
// flag for --dry-run
bool will_require_imatrix = false;
@ -1059,7 +1010,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
if (metadata[i].allows_quantization) {
metadata[i].target_type = llama_tensor_get_type_internal(qs, params, tensor, default_type, metadata[i]);
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
} else {
metadata[i].target_type = tensor->type;
}

View File

@ -12,6 +12,33 @@
struct llama_model;
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
TOKEN_EMBD,
ATTENTION_Q,
ATTENTION_V,
ATTENTION_K,
ATTENTION_QKV,
ATTENTION_KV_B,
ATTENTION_OUTPUT,
FFN_UP,
FFN_GATE,
FFN_DOWN,
OUTPUT,
OTHER
};
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
std::string name;
ggml_type target_type;
tensor_category category;
std::string remapped_imatrix_name;
bool allows_quantization;
bool requires_imatrix;
};
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
@ -56,7 +83,7 @@ struct quantize_state_impl {
}
};
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype);
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
ggml_type llama_ftype_get_default_type(llama_ftype ftype);
// Ftype name <-> enum conversions.
@ -64,9 +91,9 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype);
llama_ftype llama_ftype_from_name(const char * name);
const char * llama_ftype_to_name(llama_ftype ftype);
// Initialize quantize_state_impl counters by scanning tensor names.
// tensor_names: all quantizable weight tensor names in the model.
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names);
// Initialize quantize_state_impl counters and populate tensor_metadata categories.
// metadata: vector with name fields already set, will have category field populated.
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata);
// Returns true if this tensor should be quantized (based on name, dims, params).
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);

View File

@ -515,7 +515,8 @@ static std::string detect_gguf_filename(const std::string & repo, const std::str
static std::optional<gguf_remote_model> fetch_and_parse(
const std::string & repo,
const std::string & filename,
const std::string & cache_path) {
const std::string & cache_path,
bool verbose) {
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
// Progressive download inspired by RangeView.fetchChunk()
@ -524,7 +525,9 @@ static std::optional<gguf_remote_model> fetch_and_parse(
const size_t max_chunk = 64 * 1024 * 1024;
while (chunk_size <= max_chunk) {
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
if (verbose) {
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
}
char range_buf[64];
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
@ -565,7 +568,8 @@ static std::optional<gguf_remote_model> fetch_or_cached(
const std::string & repo,
const std::string & filename,
const std::string & cdir,
const std::string & repo_part) {
const std::string & repo_part,
bool verbose) {
std::string fname_part = sanitize_for_path(filename);
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
@ -574,20 +578,23 @@ static std::optional<gguf_remote_model> fetch_or_cached(
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
auto result = gguf_parse_meta(cached);
if (result.has_value()) {
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
if (verbose) {
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
}
return result;
}
}
}
fs_create_directory_with_parents(cdir);
return fetch_and_parse(repo, filename, cache_path);
return fetch_and_parse(repo, filename, cache_path, verbose);
}
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant,
const std::string & cache_dir) {
const std::string & cache_dir,
bool verbose) {
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
std::string repo_part = sanitize_for_path(repo);
@ -597,7 +604,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
return std::nullopt;
}
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
if (!model_opt.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
return std::nullopt;
@ -612,8 +619,10 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
return std::nullopt;
}
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
model.n_split, model.n_split - 1);
if (verbose) {
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
model.n_split, model.n_split - 1);
}
for (int i = 2; i <= model.n_split; i++) {
char num_buf[6], total_buf[6];
@ -621,7 +630,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
if (!shard.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
return std::nullopt;

View File

@ -39,4 +39,5 @@ struct gguf_remote_model {
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant = "Q8_0",
const std::string & cache_dir = ""); // empty = default
const std::string & cache_dir = "", // empty = default
bool verbose = true);

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=2560, n_ff=10240, n_vocab=262144, n_layer=34, n_head=8, n_head_kv=4
[F32] f32
token_embd.weight q6_K
[F16] f16
token_embd.weight q6_K
[Q4_0] q4_0
token_embd.weight q6_K
@ -1205,7 +1203,6 @@ blk.33.attn_output.weight iq2_xxs
blk.33.attn_v.weight q2_K
[BF16] bf16
token_embd.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q6_K

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=8192, n_ff=28672, n_vocab=128256, n_layer=80, n_head=64, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -3324,7 +3322,6 @@ blk.79.attn_v.weight q4_K
output.weight q5_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q4_K

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=1024, n_ff=3072, n_vocab=151936, n_layer=28, n_head=16, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1013,7 +1011,6 @@ blk.27.attn_output.weight iq2_xxs
blk.27.attn_v.weight q2_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=5120, n_ff=17408, n_vocab=151936, n_layer=40, n_head=40, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1613,7 +1611,6 @@ blk.39.attn_output.weight iq2_xxs
blk.39.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=2048, n_ff=5120, n_vocab=151936, n_layer=48, n_head=16, n_head_kv=2, n_expert=512
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1790,7 +1788,6 @@ blk.47.attn_v.weight q4_K
output.weight q5_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q4_K

View File

@ -2,10 +2,8 @@
# n_embd=5120, n_ff=17408, n_vocab=248320, n_layer=64, n_head=24, n_head_kv=4
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1898,7 +1896,6 @@ blk.63.attn_output.weight iq2_xxs
blk.63.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=4096, n_ff=0, n_vocab=248320, n_layer=60, n_head=32, n_head_kv=2, n_expert=512
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -2205,7 +2203,6 @@ blk.59.attn_output.weight iq2_xxs
blk.59.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=4096, n_ff=11264, n_vocab=128896, n_layer=45, n_head=64, n_head_kv=8, n_expert=288
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -2078,7 +2076,6 @@ blk.44.attn_output.weight iq2_xxs
blk.44.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -268,21 +268,20 @@ static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_
quantize_state_impl qs(mdl, &qparams);
std::vector<std::string> names;
names.reserve(tensors.size());
for (const auto & mt : tensors) {
names.push_back(mt.tensor->name);
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
metadata[i].name = tensors[i].tensor->name;
}
init_quantize_state_counters(qs, names);
init_quantize_state_counters(qs, metadata);
ggml_type default_type = llama_ftype_get_default_type(ftype);
std::vector<std::pair<std::string, ggml_type>> result;
result.reserve(tensors.size());
for (const auto & mt : tensors) {
ggml_type got = llama_tensor_get_type(qs, default_type, mt.tensor, ftype);
result.push_back({ mt.tensor->name, got });
for (size_t i = 0; i < tensors.size(); ++i) {
ggml_type got = llama_tensor_get_type(qs, &qparams, tensors[i].tensor, default_type, metadata[i]);
result.push_back({ metadata[i].name, got });
}
return result;
@ -408,7 +407,7 @@ static bool run_test_section(llama_model & mdl,
}
if (got != expected) {
printf(" FAIL %-50s expected %s, got %s\n", name.c_str(), ggml_type_name(expected), ggml_type_name(got));
printf(" FAIL %-50s %-10s expected %s, got %s\n", name.c_str(), llama_ftype_to_name(section.ftype), ggml_type_name(expected), ggml_type_name(got));
all_pass = false;
}
}
@ -432,8 +431,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
std::string name = model_name_from_repo(spec.repo);
printf("=== %s ===\n", name.c_str());
fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
if (!result.has_value()) {
printf(" SKIP (could not fetch model metadata)\n\n");
total_skip++;
@ -506,5 +504,8 @@ int main(int argc, char ** argv) {
return run_generate(snapshot_dir);
}
// suppress llama log warnings during test (e.g. tensor type fallback messages)
llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
return run_remote_tests(snapshot_dir, argv[0]);
}