Update with latest changes to state counters

This commit is contained in:
Colin Kealty 2026-03-11 13:24:51 -04:00
parent 506a49006c
commit 3948227d23
18 changed files with 11748 additions and 2135 deletions

View File

@ -1 +1,36 @@
IMPORTANT: Ensure youve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
IMPORTANT: Ensure you've thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
# AI Policy
- AI is assistive only; AI-generated PRs are restricted per AGENTS.md
- Contributor reviews and writes code themselves
# Code Style & Conventions
- snake_case naming; optimize for longest common prefix
- 4 spaces indentation, brackets on same line
- `void * ptr`, `int & a` (space around pointer/reference)
- Avoid templates and fancy STL
- Use sized integer types (`int32_t`) in public API
- See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines, naming, and PR process
# ggml Tensor Conventions
- Data stored in row-major order
- Dimension 0 = columns, dimension 1 = rows, dimension 2 = matrices
- **Matrix multiply is unconventional**: `C = ggml_mul_mat(ctx, A, B)` means `C^T = A * B^T`
# Quantization
- See [docs/quantization/](docs/quantization/) for comprehensive documentation
- See [docs/quantization/09-adding-new-types.md](docs/quantization/09-adding-new-types.md) for adding new types
## Key Files
- `ggml/include/ggml.h`: type enums (`ggml_type`)
- `ggml/src/ggml-common.h`: block structures
- `ggml/src/ggml-quants.c`: reference quantize/dequantize implementations
- `tools/quantize/quantize.cpp`: CLI tool
- `src/llama-quant.cpp`: core quantization engine
## Quantization Families
- **Q**: simple uniform quantization
- **K**: super-block quantization (multiple sub-blocks per super-block)
- **IQ**: importance-weighted quantization
- **T**: ternary quantization
- **MXFP**: microscaling (MX) floating-point quantization (OCP MX Formats)

View File

@ -13,22 +13,6 @@
#include <thread>
#include <unordered_map>
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
TOKEN_EMBD,
ATTENTION_Q,
ATTENTION_V,
ATTENTION_K,
ATTENTION_QKV,
ATTENTION_KV_B,
ATTENTION_OUTPUT,
FFN_UP,
FFN_GATE,
FFN_DOWN,
OUTPUT,
OTHER
};
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
@ -150,15 +134,6 @@ static bool category_is_attn_v(tensor_category cat) {
cat == tensor_category::ATTENTION_KV_B;
}
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
ggml_type target_type;
tensor_category category;
std::string remapped_imatrix_name;
bool allows_quantization;
bool requires_imatrix;
};
//
// dequantization
//
@ -355,6 +330,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
return return_type;
}
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
@ -604,15 +580,8 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
return new_type;
}
// public API: compute category from tensor name and delegate to _impl
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
tensor_category category = tensor_get_category(name);
return llama_tensor_get_type_impl(qs, new_type, tensor, ftype, category);
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to (used internally by llama_model_quantize_impl)
static ggml_type llama_tensor_get_type_internal(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
// outer wrapper: determine the ggml_type that this tensor should be quantized to
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
return tensor->type;
}
@ -845,15 +814,16 @@ const char * llama_ftype_to_name(llama_ftype ftype) {
return nullptr;
}
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names) {
for (const auto & name : tensor_names) {
tensor_category cat = tensor_get_category(name);
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
for (auto & tm : metadata) {
tensor_category cat = tensor_get_category(tm.name);
tm.category = cat;
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (tensor_name_match_output_weight(name.c_str())) {
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
}
@ -996,15 +966,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
{
std::vector<std::string> tensor_names;
tensor_names.reserve(tensors.size());
for (const auto * it : tensors) {
tensor_names.emplace_back(ggml_get_name(it->tensor));
}
init_quantize_state_counters(qs, tensor_names);
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
metadata[i].name = ggml_get_name(tensors[i]->tensor);
}
// initialize quantization state counters and metadata categories
init_quantize_state_counters(qs, metadata);
int idx = 0;
uint16_t n_split = 1;
@ -1017,25 +987,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
// initialize quantization state before preliminary loop (counters for use_more_bits)
{
for (size_t i = 0; i < tensors.size(); ++i) {
const auto cat = tensor_get_category(tensors[i]->tensor->name);
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
metadata[i].category = cat; // save and re-use the category while we're at it
}
// these also need to be set to n_layer by default
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
}
// flag for --dry-run
bool will_require_imatrix = false;
@ -1059,7 +1010,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
if (metadata[i].allows_quantization) {
metadata[i].target_type = llama_tensor_get_type_internal(qs, params, tensor, default_type, metadata[i]);
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
} else {
metadata[i].target_type = tensor->type;
}

View File

@ -12,6 +12,33 @@
struct llama_model;
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
TOKEN_EMBD,
ATTENTION_Q,
ATTENTION_V,
ATTENTION_K,
ATTENTION_QKV,
ATTENTION_KV_B,
ATTENTION_OUTPUT,
FFN_UP,
FFN_GATE,
FFN_DOWN,
OUTPUT,
OTHER
};
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
std::string name;
ggml_type target_type;
tensor_category category;
std::string remapped_imatrix_name;
bool allows_quantization;
bool requires_imatrix;
};
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
@ -56,7 +83,7 @@ struct quantize_state_impl {
}
};
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype);
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
ggml_type llama_ftype_get_default_type(llama_ftype ftype);
// Ftype name <-> enum conversions.
@ -64,9 +91,9 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype);
llama_ftype llama_ftype_from_name(const char * name);
const char * llama_ftype_to_name(llama_ftype ftype);
// Initialize quantize_state_impl counters by scanning tensor names.
// tensor_names: all quantizable weight tensor names in the model.
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names);
// Initialize quantize_state_impl counters and populate tensor_metadata categories.
// metadata: vector with name fields already set, will have category field populated.
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata);
// Returns true if this tensor should be quantized (based on name, dims, params).
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);

View File

@ -515,7 +515,8 @@ static std::string detect_gguf_filename(const std::string & repo, const std::str
static std::optional<gguf_remote_model> fetch_and_parse(
const std::string & repo,
const std::string & filename,
const std::string & cache_path) {
const std::string & cache_path,
bool verbose) {
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
// Progressive download inspired by RangeView.fetchChunk()
@ -524,7 +525,9 @@ static std::optional<gguf_remote_model> fetch_and_parse(
const size_t max_chunk = 64 * 1024 * 1024;
while (chunk_size <= max_chunk) {
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
if (verbose) {
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
}
char range_buf[64];
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
@ -565,7 +568,8 @@ static std::optional<gguf_remote_model> fetch_or_cached(
const std::string & repo,
const std::string & filename,
const std::string & cdir,
const std::string & repo_part) {
const std::string & repo_part,
bool verbose) {
std::string fname_part = sanitize_for_path(filename);
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
@ -574,20 +578,23 @@ static std::optional<gguf_remote_model> fetch_or_cached(
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
auto result = gguf_parse_meta(cached);
if (result.has_value()) {
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
if (verbose) {
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
}
return result;
}
}
}
fs_create_directory_with_parents(cdir);
return fetch_and_parse(repo, filename, cache_path);
return fetch_and_parse(repo, filename, cache_path, verbose);
}
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant,
const std::string & cache_dir) {
const std::string & cache_dir,
bool verbose) {
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
std::string repo_part = sanitize_for_path(repo);
@ -597,7 +604,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
return std::nullopt;
}
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
if (!model_opt.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
return std::nullopt;
@ -612,8 +619,10 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
return std::nullopt;
}
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
model.n_split, model.n_split - 1);
if (verbose) {
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
model.n_split, model.n_split - 1);
}
for (int i = 2; i <= model.n_split; i++) {
char num_buf[6], total_buf[6];
@ -621,7 +630,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
if (!shard.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
return std::nullopt;

View File

@ -39,4 +39,5 @@ struct gguf_remote_model {
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant = "Q8_0",
const std::string & cache_dir = ""); // empty = default
const std::string & cache_dir = "", // empty = default
bool verbose = true);

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=2560, n_ff=10240, n_vocab=262144, n_layer=34, n_head=8, n_head_kv=4
[F32] f32
token_embd.weight q6_K
[F16] f16
token_embd.weight q6_K
[Q4_0] q4_0
token_embd.weight q6_K
@ -1205,7 +1203,6 @@ blk.33.attn_output.weight iq2_xxs
blk.33.attn_v.weight q2_K
[BF16] bf16
token_embd.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q6_K

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=8192, n_ff=28672, n_vocab=128256, n_layer=80, n_head=64, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -3324,7 +3322,6 @@ blk.79.attn_v.weight q4_K
output.weight q5_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q4_K

File diff suppressed because it is too large Load Diff

View File

@ -2,10 +2,8 @@
# n_embd=1024, n_ff=3072, n_vocab=151936, n_layer=28, n_head=16, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1013,7 +1011,6 @@ blk.27.attn_output.weight iq2_xxs
blk.27.attn_v.weight q2_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=5120, n_ff=17408, n_vocab=151936, n_layer=40, n_head=40, n_head_kv=8
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1613,7 +1611,6 @@ blk.39.attn_output.weight iq2_xxs
blk.39.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=2048, n_ff=5120, n_vocab=151936, n_layer=48, n_head=16, n_head_kv=2, n_expert=512
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1790,7 +1788,6 @@ blk.47.attn_v.weight q4_K
output.weight q5_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
token_embd.weight q4_K

View File

@ -2,10 +2,8 @@
# n_embd=5120, n_ff=17408, n_vocab=248320, n_layer=64, n_head=24, n_head_kv=4
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -1898,7 +1896,6 @@ blk.63.attn_output.weight iq2_xxs
blk.63.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=4096, n_ff=0, n_vocab=248320, n_layer=60, n_head=32, n_head_kv=2, n_expert=512
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -2205,7 +2203,6 @@ blk.59.attn_output.weight iq2_xxs
blk.59.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -2,10 +2,8 @@
# n_embd=4096, n_ff=11264, n_vocab=128896, n_layer=45, n_head=64, n_head_kv=8, n_expert=288
[F32] f32
output.weight q6_K
[F16] f16
output.weight q6_K
[Q4_0] q4_0
output.weight q6_K
@ -2078,7 +2076,6 @@ blk.44.attn_output.weight iq2_xxs
blk.44.attn_v.weight q4_K
[BF16] bf16
output.weight q6_K
[TQ1_0] tq1_0
output.weight q6_K

View File

@ -268,21 +268,20 @@ static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_
quantize_state_impl qs(mdl, &qparams);
std::vector<std::string> names;
names.reserve(tensors.size());
for (const auto & mt : tensors) {
names.push_back(mt.tensor->name);
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
metadata[i].name = tensors[i].tensor->name;
}
init_quantize_state_counters(qs, names);
init_quantize_state_counters(qs, metadata);
ggml_type default_type = llama_ftype_get_default_type(ftype);
std::vector<std::pair<std::string, ggml_type>> result;
result.reserve(tensors.size());
for (const auto & mt : tensors) {
ggml_type got = llama_tensor_get_type(qs, default_type, mt.tensor, ftype);
result.push_back({ mt.tensor->name, got });
for (size_t i = 0; i < tensors.size(); ++i) {
ggml_type got = llama_tensor_get_type(qs, &qparams, tensors[i].tensor, default_type, metadata[i]);
result.push_back({ metadata[i].name, got });
}
return result;
@ -408,7 +407,7 @@ static bool run_test_section(llama_model & mdl,
}
if (got != expected) {
printf(" FAIL %-50s expected %s, got %s\n", name.c_str(), ggml_type_name(expected), ggml_type_name(got));
printf(" FAIL %-50s %-10s expected %s, got %s\n", name.c_str(), llama_ftype_to_name(section.ftype), ggml_type_name(expected), ggml_type_name(got));
all_pass = false;
}
}
@ -432,8 +431,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
std::string name = model_name_from_repo(spec.repo);
printf("=== %s ===\n", name.c_str());
fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
if (!result.has_value()) {
printf(" SKIP (could not fetch model metadata)\n\n");
total_skip++;
@ -506,5 +504,8 @@ int main(int argc, char ** argv) {
return run_generate(snapshot_dir);
}
// suppress llama log warnings during test (e.g. tensor type fallback messages)
llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
return run_remote_tests(snapshot_dir, argv[0]);
}