Add unit test coverage for llama_tensor_get_type

This commit is contained in:
Colin Kealty 2026-03-04 13:31:57 -05:00
parent b5e1212063
commit 99119ceaf4
8 changed files with 4439 additions and 61 deletions

View File

@ -1,11 +1,11 @@
#include "llama.h"
#include "llama-quant.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <cinttypes>
#include <fstream>
#include <mutex>
@ -13,13 +13,6 @@
#include <thread>
#include <unordered_map>
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
    std::string name;                 // tensor-name regex pattern, as given on the command line
    ggml_type type = GGML_TYPE_COUNT; // requested quantization type; GGML_TYPE_COUNT means "unset"
};
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
@ -157,46 +150,6 @@ static bool category_is_attn_v(tensor_category cat) {
cat == tensor_category::ATTENTION_KV_B;
}
//
// quantization state
//
// Mutable bookkeeping shared across the quantization of one model:
// per-category tensor totals/indices plus pre-compiled --tensor-type patterns.
struct quantize_state_impl {
    const llama_model & model;
    const llama_model_quantize_params * params;

    // totals per tensor category (filled before the main loop)
    int n_attention_wv = 0;
    int n_ffn_down = 0;
    int n_ffn_gate = 0;
    int n_ffn_up = 0;

    // running indices, advanced as tensors of each category are processed
    int i_attention_wv = 0;
    int i_ffn_down = 0;
    int i_ffn_gate = 0;
    int i_ffn_up = 0;

    int n_fallback = 0;       // tensors that fell back to a different type than requested
    bool has_imatrix = false; // importance-matrix data was supplied

    // used to figure out if a model has tied embeddings (tok_embd shares weights with output)
    bool has_tied_embeddings = true; // assume tied until we see output.weight

    // tensor type override patterns (compiled once, used twice)
    std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
        model(model), params(params)
    {
        // compile regex patterns once - they are expensive
        if (params->tensor_types) {
            const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
            for (const auto & [tname, qtype] : tensor_types) {
                tensor_type_patterns.emplace_back(std::regex(tname), qtype);
            }
        }
    }
};
// per-tensor metadata, computed in the preliminary loop and used in the main loop
struct tensor_metadata {
ggml_type target_type;
@ -286,7 +239,7 @@ static void llama_tensor_dequantize_impl(
// do we allow this tensor to be quantized?
//
static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
// trivial checks first -- no string ops needed
if (params->only_copy) return false;
@ -402,8 +355,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
return return_type;
}
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@ -652,8 +604,15 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
return new_type;
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
// public API: compute category from tensor name and delegate to _impl
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
tensor_category category = tensor_get_category(name);
return llama_tensor_get_type_impl(qs, new_type, tensor, ftype, category);
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to (used internally by llama_model_quantize_impl)
static ggml_type llama_tensor_get_type_internal(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
return tensor->type;
}
@ -784,7 +743,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
// given a file type, get the default tensor type
//
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
@ -827,12 +786,85 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
}
}
// One entry of the canonical ftype-name <-> enum mapping.
struct ftype_name_entry {
    const char * name;
    llama_ftype ftype;
};

// Lookup table backing llama_ftype_from_name()/llama_ftype_to_name().
// Both lookups are linear scans, so order is irrelevant.
static const ftype_name_entry ftype_name_table[] = {
    { "F32", LLAMA_FTYPE_ALL_F32 },
    { "F16", LLAMA_FTYPE_MOSTLY_F16 },
    { "BF16", LLAMA_FTYPE_MOSTLY_BF16 },
    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0 },
    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1 },
    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0 },
    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1 },
    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0 },
    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M },
    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S },
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M },
    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K },
    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S },
    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M },
    { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS },
    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S },
    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M },
    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS },
    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS },
    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S },
    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M },
    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL },
    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS },
    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0 },
    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0 },
    { "MXFP4_MOE", LLAMA_FTYPE_MOSTLY_MXFP4_MOE },
};
// Look up an ftype by its canonical name (e.g. "Q4_K_M").
// Returns (llama_ftype)-1 when the name is unknown.
llama_ftype llama_ftype_from_name(const char * name) {
    const size_t n_entries = sizeof(ftype_name_table) / sizeof(ftype_name_table[0]);
    for (size_t i = 0; i < n_entries; ++i) {
        if (strcmp(ftype_name_table[i].name, name) == 0) {
            return ftype_name_table[i].ftype;
        }
    }
    return (llama_ftype)-1;
}
// Inverse of llama_ftype_from_name(): canonical name for an ftype,
// or nullptr if the ftype has no entry in the table.
const char * llama_ftype_to_name(llama_ftype ftype) {
    const size_t n_entries = sizeof(ftype_name_table) / sizeof(ftype_name_table[0]);
    for (size_t i = 0; i < n_entries; ++i) {
        if (ftype_name_table[i].ftype == ftype) {
            return ftype_name_table[i].name;
        }
    }
    return nullptr;
}
// Seed the per-category counters of `qs` from the model's tensor names:
// - n_attention_wv counts tensors categorized as attention-V projections
// - tied-embedding detection: seeing a dedicated output weight clears the flag
// - n_ffn_* are simply the layer count (one FFN tensor of each kind per layer)
// NOTE(review): this writes qs.has_tied_embeddings, but the quantize_state_impl
// declared in the new header defines `has_output` instead — confirm the struct
// fields and this function agree in the final tree.
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names) {
    for (const auto & name : tensor_names) {
        tensor_category cat = tensor_get_category(name);
        if (category_is_attn_v(cat)) {
            ++qs.n_attention_wv;
        }
        // a standalone output.weight means tok_embd is NOT tied to the output
        if (tensor_name_match_output_weight(name.c_str())) {
            qs.has_tied_embeddings = false;
        }
    }
    // FFN down/gate/up tensors occur once per layer
    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
}
//
// main quantization driver
//
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
int nthread = params->nthread;
@ -841,7 +873,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
nthread = std::thread::hardware_concurrency();
}
default_type = llama_ftype_get_default_type(ftype);
ggml_type default_type = llama_ftype_get_default_type(ftype);
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
@ -878,8 +910,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
__func__, (int)imatrix_data->size());
qs.has_imatrix = true;
__func__, (int)imatrix_data->size()); qs.has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
for (float f : kv.second) {
@ -961,7 +992,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
{ // Based on old loop
std::vector<std::string> tensor_names;
tensor_names.reserve(tensors.size());
for (const auto * it : tensors) {
tensor_names.emplace_back(ggml_get_name(it->tensor));
}
init_quantize_state_counters(qs, tensor_names);
}
int idx = 0;
uint16_t n_split = 1;
// Assume split index is continuous
@ -1013,7 +1054,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
if (metadata[i].allows_quantization) {
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
metadata[i].target_type = llama_tensor_get_type_internal(qs, params, tensor, default_type, metadata[i]);
} else {
metadata[i].target_type = tensor->type;
}
@ -1045,7 +1086,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
}
}
size_t total_size_org = 0;
size_t total_size_new = 0;

View File

@ -1 +1,54 @@
#pragma once
#include "llama.h"
#include "ggml.h"
#include "llama-arch.h"
#include <string>
#include <vector>
struct llama_model;
// Mutable bookkeeping shared across the quantization of one model.
// NOTE(review): this header version declares `n_k_quantized`/`has_output`,
// while llama-quant.cpp code in this commit uses `has_tied_embeddings` and a
// `tensor_type_patterns` member — confirm the two definitions were reconciled.
struct quantize_state_impl {
    const llama_model & model;
    const llama_model_quantize_params * params;

    // totals per tensor category (filled before the main loop)
    int n_attention_wv = 0;
    int n_ffn_down = 0;
    int n_ffn_gate = 0;
    int n_ffn_up = 0;

    // running indices, advanced as tensors of each category are processed
    int i_attention_wv = 0;
    int i_ffn_down = 0;
    int i_ffn_gate = 0;
    int i_ffn_up = 0;

    int n_k_quantized = 0; // tensors quantized to the requested k-quant type
    int n_fallback = 0;    // tensors that fell back to a different type

    bool has_imatrix = false; // importance-matrix data was supplied

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};
// Standard tensor-type selection logic (category is derived from the tensor name).
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype);

// Default ggml_type for a given file type.
// NOTE(review): the definition in llama-quant.cpp appears to be named
// llama_ftype_get_default_type — confirm declaration and definition agree.
ggml_type llama_ftype_default_type(llama_ftype ftype);

// Ftype name <-> enum conversions.
// Returns (llama_ftype)-1 on failure.
llama_ftype llama_ftype_from_name(const char * name);
const char * llama_ftype_to_name(llama_ftype ftype);

// Initialize quantize_state_impl counters by scanning tensor names.
// tensor_names: all quantizable weight tensor names in the model.
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names);

// Returns true if this tensor should be quantized (based on name, dims, params).
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);

1
tests/.gitignore vendored
View File

@ -1,5 +1,6 @@
*
!*.*
!snapshots/
*.o
ggml-common.h
**/*.swp

View File

@ -124,6 +124,34 @@ static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
}
static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
// Handle array-valued fields (e.g. per-layer head counts in hybrid models)
// by reading the first element as a representative value.
if (vtype == GGUF_TYPE_ARRAY) {
int32_t elem_type;
uint64_t count;
if (!r.read_val(elem_type)) {
return false;
}
if (!r.read_val(count)) {
return false;
}
if (count == 0) {
return false;
}
// Read first element, skip the rest
if (!gguf_read_uint32_val(r, elem_type, out)) {
return false;
}
for (uint64_t i = 1; i < count; i++) {
size_t sz = gguf_val_type_size(elem_type);
if (sz == 0) {
return false;
}
if (!r.skip(sz)) {
return false;
}
}
}
if (vtype == GGUF_TYPE_UINT8) {
uint8_t v;
if (!r.read_val(v)) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -116,6 +116,39 @@ int main() {
// Verify tensor count
TEST_ASSERT(model3.tensors.size() == 780, "expected tensor count == 780");
// Test a hybrid-attention model with array-valued head counts
auto result4 = gguf_fetch_model_meta("ggml-org/Step-3.5-Flash-GGUF", "Q4_K");
if (!result4.has_value()) {
fprintf(stderr, "FAIL: could not fetch Step-3.5-Flash metadata\n");
return 1;
}
const auto & model4 = result4.value();
fprintf(stderr, "Architecture: %s\n", model4.architecture.c_str());
fprintf(stderr, "n_embd: %u\n", model4.n_embd);
fprintf(stderr, "n_ff: %u\n", model4.n_ff);
fprintf(stderr, "n_vocab: %u\n", model4.n_vocab);
fprintf(stderr, "n_layer: %u\n", model4.n_layer);
fprintf(stderr, "n_head: %u\n", model4.n_head);
fprintf(stderr, "n_head_kv: %u\n", model4.n_head_kv);
fprintf(stderr, "n_expert: %u\n", model4.n_expert);
fprintf(stderr, "n_embd_head_k: %u\n", model4.n_embd_head_k);
fprintf(stderr, "n_embd_head_v: %u\n", model4.n_embd_head_v);
fprintf(stderr, "tensors: %zu\n", model4.tensors.size());
TEST_ASSERT(model4.architecture == "step35", "expected architecture 'step35'");
TEST_ASSERT(model4.n_layer == 45, "expected n_layer == 45");
TEST_ASSERT(model4.n_embd == 4096, "expected n_embd == 4096");
TEST_ASSERT(model4.n_ff == 11264, "expected n_ff == 11264");
TEST_ASSERT(model4.n_head == 64, "expected n_head == 64 (first element of per-layer array)");
TEST_ASSERT(model4.n_head_kv == 8, "expected n_head_kv == 8 (first element of per-layer array)");
TEST_ASSERT(model4.n_expert == 288, "expected n_expert == 288");
TEST_ASSERT(model4.n_embd_head_k == 128, "expected n_embd_head_k == 128");
TEST_ASSERT(model4.n_embd_head_v == 128, "expected n_embd_head_v == 128");
TEST_ASSERT(model4.n_vocab == 128896, "expected n_vocab == 128896");
TEST_ASSERT(model4.tensors.size() == 754, "expected tensor count == 754");
fprintf(stderr, "=== ALL TESTS PASSED ===\n");
return 0;
}

View File

@ -0,0 +1,520 @@
#include "ggml.h"
#include "ggml-cpp.h"
#include "llama.h"
#include "../src/llama-arch.h"
#include "../src/llama-model.h"
#include "../src/llama-quant.h"
#include "gguf-model-data.h"
#include <cctype>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
// ---------------------------------------------------------------------------
// Mock tensor construction - may be better to extract this in the future
// ---------------------------------------------------------------------------
// A ggml tensor together with the context that owns its metadata.
// The context must outlive the tensor pointer, hence they travel together.
struct mock_tensor {
    ggml_context_ptr ctx;  // owning context (no_alloc: metadata only, no data buffer)
    ggml_tensor * tensor;  // non-owning view into ctx
};
// Build a metadata-only (no_alloc) F32 tensor with the given name and shape.
// Trailing dimensions of 1 reduce the effective dimensionality, matching the
// original 2d/3d/4d constructor selection.
static mock_tensor make_mock_tensor(const std::string & name, int64_t ne0, int64_t ne1,
                                    int64_t ne2 = 1, int64_t ne3 = 1) {
    struct ggml_init_params init_params = {
        /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx(ggml_init(init_params));

    // pick the dimensionality from the highest non-trivial extent
    const int n_dims = (ne3 > 1) ? 4 : (ne2 > 1) ? 3 : 2;
    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };

    ggml_tensor * t = ggml_new_tensor(ctx.get(), GGML_TYPE_F32, n_dims, ne);
    ggml_set_name(t, name.c_str());

    return { std::move(ctx), t };
}
// Reverse lookup of ggml_type_name(): scan every type id and compare names.
// Returns GGML_TYPE_COUNT when no type matches.
static ggml_type ggml_type_from_name(const std::string & name) {
    int i = 0;
    while (i < GGML_TYPE_COUNT) {
        const char * cname = ggml_type_name((ggml_type) i);
        if (cname != nullptr && name.compare(cname) == 0) {
            return (ggml_type) i;
        }
        ++i;
    }
    return GGML_TYPE_COUNT;
}
// ---------------------------------------------------------------------------
// File parser for snapshot files (quant type schemas)
// ---------------------------------------------------------------------------
// One "[FTYPE] default_type" section of a snapshot file.
struct snapshot_section {
    llama_ftype ftype;       // the file type this section covers
    ggml_type default_type;  // expected type for every tensor not listed below
    // tensors whose assigned type deviates from default_type: {name, type}
    std::vector<std::pair<std::string, ggml_type>> overrides;
};
// This function is pretty ugly, but it's a trade-off of readable snapshot files
// versus readable parsing code
//
// Snapshot grammar (line oriented):
//   # comment / empty line       -> ignored
//   [FTYPE_NAME] default_type    -> starts a new section
//   <tensor name> <type name>    -> override inside the current section
// Returns false (with a message on stderr) on any malformed line.
static bool parse_snapshot_file(const std::string & path, std::vector<snapshot_section> & sections) {
    std::ifstream f(path);
    if (!f.good()) {
        return false;
    }
    snapshot_section * cur = nullptr;  // section that tensor lines attach to
    std::string line;
    while (std::getline(f, line)) {
        if (line.empty() || line[0] == '#') {
            continue;
        }
        // section header: [FTYPE_NAME] default_type
        if (line[0] == '[') {
            auto close = line.find(']');
            if (close == std::string::npos) {
                fprintf(stderr, "parse error: missing ] in '%s'\n", line.c_str());
                return false;
            }
            std::string ftype_str = line.substr(1, close - 1);
            std::string default_str;
            size_t pos = close + 1;
            // skip spaces between ']' and the default type name
            while (pos < line.size() && line[pos] == ' ') { pos++; }
            default_str = line.substr(pos);
            llama_ftype ftype = llama_ftype_from_name(ftype_str.c_str());
            // llama_ftype_from_name signals "unknown" with (llama_ftype)-1
            if ((int)ftype < 0) {
                fprintf(stderr, "parse error: unknown ftype '%s'\n", ftype_str.c_str());
                return false;
            }
            ggml_type dtype = ggml_type_from_name(default_str);
            if (dtype == GGML_TYPE_COUNT) {
                fprintf(stderr, "parse error: unknown default type '%s'\n", default_str.c_str());
                return false;
            }
            sections.push_back({ftype, dtype, {}});
            cur = &sections.back();
            continue;
        }
        if (!cur) {
            fprintf(stderr, "parse error: tensor line before any section: '%s'\n", line.c_str());
            return false;
        }
        // tensor line: split at the LAST space — name on the left, type on the right
        auto sp = line.rfind(' ');
        if (sp == std::string::npos) {
            fprintf(stderr, "parse error: no space in tensor line: '%s'\n", line.c_str());
            return false;
        }
        std::string tname = line.substr(0, sp);
        std::string ttype = line.substr(sp + 1);
        ggml_type gt = ggml_type_from_name(ttype);
        if (gt == GGML_TYPE_COUNT) {
            fprintf(stderr, "parse error: unknown type '%s' for tensor '%s'\n",
                    ttype.c_str(), tname.c_str());
            return false;
        }
        cur->overrides.push_back({tname, gt});
    }
    return true;
}
// ---------------------------------------------------------------------------
// Remote model support using gguf-model-data.cpp
// ---------------------------------------------------------------------------
// Identifies a model on the hub: repository id plus the quant label whose
// metadata (tensor names/shapes) is fetched for testing.
struct remote_model_spec {
    const char * repo;   // e.g. "ggml-org/Qwen3-0.6B-GGUF"
    const char * quant;  // e.g. "Q8_0"
};
// Get model name from repo: strip org prefix, strip -GGUF suffix,
// and strip anything up to and including first '_' (e.g. "deepseek-ai_DeepSeek-V3.1").
static std::string model_name_from_repo(const char * repo) {
    std::string name{repo};

    // drop the "org/" prefix, if any
    if (const auto slash = name.find('/'); slash != std::string::npos) {
        name.erase(0, slash + 1);
    }

    // drop a trailing "-GGUF"
    const std::string suffix = "-GGUF";
    if (name.size() >= suffix.size() &&
        name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0) {
        name.erase(name.size() - suffix.size());
    }

    // drop everything up to and including the first '_'
    if (const auto underscore = name.find('_'); underscore != std::string::npos) {
        name.erase(0, underscore + 1);
    }

    return name;
}
// Map a model name to its snapshot file stem: the name lowercased.
// Fix: std::tolower requires its argument to be representable as unsigned char;
// calling it with a plain (possibly negative) char is undefined behavior,
// so cast through unsigned char first.
static std::string snapshot_file_from_name(const std::string & name) {
    std::string lower = name;
    for (auto & c : lower) {
        c = (char) std::tolower((unsigned char) c);
    }
    return lower;
}
// Models whose quantization-type assignments are snapshot-tested.
// Chosen to cover a spread of architectures, sizes, and MoE/dense variants.
static const remote_model_spec model_specs[] = {
    { "ggml-org/Qwen3-0.6B-GGUF", "Q8_0" },
    { "ggml-org/GLM-4.6V-GGUF", "Q8_0" },
    { "ggml-org/Step-3.5-Flash-GGUF", "Q4_K" },
    { "ggml-org/Qwen3-Coder-Next-GGUF", "Q8_0" },
    { "ggml-org/Qwen3-14B-GGUF", "Q8_0" },
    { "ggml-org/Nemotron-Nano-3-30B-A3B-GGUF", "Q8_0" },
    { "ggml-org/gpt-oss-120b-GGUF", "mxfp4" },
    { "ggml-org/gemma-3-4b-it-GGUF", "Q8_0" },
    { "bartowski/Meta-Llama-3.1-70B-Instruct-GGUF", "Q4_K_M" },
    { "bartowski/deepseek-ai_DeepSeek-V3.1-GGUF", "IQ1_S" },
    { "bartowski/Qwen_Qwen3.5-397B-A17B-GGUF", "IQ1_S" }, // TODO: swap with ggml-org if/when it's released
    { "bartowski/Qwen_Qwen3.5-27B-GGUF", "Q8_0" }, // TODO: swap with ggml-org if/when it's released
};

static const int n_model_specs = (int)(sizeof(model_specs) / sizeof(model_specs[0]));
// Determine llm_type from metadata.
// Only LLM_TYPE_70B matters -> probably can/should be dropped in the future
static llm_type infer_llm_type(llm_arch arch, const gguf_remote_model & remote) {
    const bool looks_like_llama_70b =
        arch == LLM_ARCH_LLAMA &&
        remote.n_layer == 80 &&
        remote.n_head != remote.n_head_kv;  // GQA: 70B uses fewer KV heads

    return looks_like_llama_70b ? LLM_TYPE_70B : LLM_TYPE_UNKNOWN;
}
// Construct a llama_model carrying only the hyperparameters that the
// quantization-type logic reads; no weights are loaded.
static std::unique_ptr<llama_model> build_mock_model_from_remote(const gguf_remote_model & remote) {
    struct llama_model_params mparams = llama_model_default_params();
    auto model = std::make_unique<llama_model>(mparams);

    model->arch = llm_arch_from_string(remote.architecture);
    model->type = infer_llm_type(model->arch, remote);

    model->hparams.n_embd = remote.n_embd;
    model->hparams.n_embd_head_k = remote.n_embd_head_k;
    model->hparams.n_embd_head_v = remote.n_embd_head_v;
    model->hparams.n_layer = remote.n_layer;
    model->hparams.n_expert = remote.n_expert;

    // the remote metadata provides a single value per field, so replicate it
    // across all layers of the per-layer arrays
    for (uint32_t i = 0; i < remote.n_layer; i++) {
        model->hparams.n_head_arr[i] = remote.n_head;
        model->hparams.n_head_kv_arr[i] = remote.n_head_kv;
        model->hparams.n_ff_arr[i] = remote.n_ff;
    }

    return model;
}
static std::vector<mock_tensor> build_mock_tensors(
const gguf_remote_model & remote,
llm_arch arch,
const llama_model_quantize_params & qparams) {
std::vector<mock_tensor> result;
for (const auto & t : remote.tensors) {
auto mt = make_mock_tensor(t.name, t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
if (tensor_allows_quantization(&qparams, arch, mt.tensor)) {
result.push_back(std::move(mt));
}
}
return result;
}
// Slurp an entire file into a string; returns "" when the file cannot be opened.
static std::string read_file_contents(const std::string & path) {
    std::ifstream in(path);
    if (!in.good()) {
        return std::string();
    }
    std::stringstream buffer;
    buffer << in.rdbuf();
    return buffer.str();
}
// ---------------------------------------------------------------------------
// Compute quantization type assignments per target ftype
// ---------------------------------------------------------------------------
// Returns {tensor_name, assigned_type} for each tensor, in order.
// A fresh quantize_state_impl is built per call so the per-category running
// indices start from zero for every ftype evaluated.
static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(
        llama_model & mdl,
        const std::vector<mock_tensor> & tensors,
        llama_ftype ftype) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = ftype;
    quantize_state_impl qs(mdl, &qparams);

    // seed the counters the same way the quantization driver does
    std::vector<std::string> names;
    names.reserve(tensors.size());
    for (const auto & mt : tensors) {
        names.push_back(mt.tensor->name);
    }
    init_quantize_state_counters(qs, names);

    ggml_type default_type = llama_ftype_default_type(ftype);

    std::vector<std::pair<std::string, ggml_type>> result;
    result.reserve(tensors.size());
    for (const auto & mt : tensors) {
        ggml_type got = llama_tensor_get_type(qs, default_type, mt.tensor, ftype);
        result.push_back({mt.tensor->name, got});
    }
    return result;
}
// ---------------------------------------------------------------------------
// Generate mode: regenerate all snapshot files
// Use this when either adding new models or modifying quants
// ---------------------------------------------------------------------------
// Render one snapshot file for `name`: a header with model hyperparameters,
// then one "[FTYPE] default_type" section per supported ftype listing only
// the tensors whose assigned type differs from that section's default.
static std::string generate_snapshot(const std::string & name,
                                     const gguf_remote_model & remote,
                                     llama_model & mdl,
                                     const std::vector<mock_tensor> & tensors) {
    std::ostringstream out;
    out << "# Model: " << name << "\n";
    out << "# n_embd=" << remote.n_embd
        << ", n_ff=" << remote.n_ff
        << ", n_vocab=" << remote.n_vocab
        << ", n_layer=" << remote.n_layer
        << ", n_head=" << remote.n_head
        << ", n_head_kv=" << remote.n_head_kv;
    if (remote.n_expert > 0) {
        out << ", n_expert=" << remote.n_expert;
    }
    out << "\n";

    for (int i = 0; i < LLAMA_FTYPE_GUESSED; i++) {
        llama_ftype ft = (llama_ftype)i;
        ggml_type default_type = llama_ftype_default_type(ft);
        if (default_type == GGML_TYPE_COUNT) {
            // no default type defined for this ftype value -> skip
            continue;
        }
        const char * fname = llama_ftype_to_name(ft);
        if (!fname) {
            // unnamed ftype cannot appear in a snapshot file
            continue;
        }
        auto types = compute_quant_types(mdl, tensors, ft);
        out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n";
        // record only deviations from the default; matching tensors are implied
        for (const auto & [name, type] : types) {
            if (type != default_type) {
                out << name << " " << ggml_type_name(type) << "\n";
            }
        }
    }
    return out.str();
}
// Interactive snapshot regeneration: after a y/N confirmation, fetch each
// model's metadata and overwrite its .schema file in snapshot_dir.
// Returns 0 on success, 1 on abort or any fetch/write failure.
static int run_generate(const std::string & snapshot_dir) {
    fprintf(stderr, "This will overwrite all snapshot files in:\n %s\n", snapshot_dir.c_str());
    fprintf(stderr, "Continue? [y/N] ");
    // single-character confirmation; anything but y/Y aborts
    int ch = fgetc(stdin);
    if (ch != 'y' && ch != 'Y') {
        fprintf(stderr, "Aborted.\n");
        return 1;
    }
    fprintf(stderr, "\n");

    int n_written = 0;
    for (int m = 0; m < n_model_specs; m++) {
        const auto & spec = model_specs[m];
        std::string name = model_name_from_repo(spec.repo);
        fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
        auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
        if (!result.has_value()) {
            // generation is all-or-nothing: a fetch failure aborts the run
            fprintf(stderr, "ERROR: could not fetch model metadata for %s\n", name.c_str());
            return 1;
        }
        const auto & remote = result.value();
        auto model = build_mock_model_from_remote(remote);
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        auto tensors = build_mock_tensors(remote, model->arch, qparams);
        std::string content = generate_snapshot(name, remote, *model, tensors);
        std::string path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
        std::ofstream f(path);
        if (!f.good()) {
            fprintf(stderr, "ERROR: could not write %s\n", path.c_str());
            return 1;
        }
        f << content;
        n_written++;
        fprintf(stderr, " wrote %s\n", path.c_str());
    }
    fprintf(stderr, "%d files written\n", n_written);
    return 0;
}
// ---------------------------------------------------------------------------
// Test mode: compare against snapshot files
// ---------------------------------------------------------------------------
// Check one snapshot section against freshly computed type assignments.
// Verifies: (1) the section's default type matches the code's default for the
// ftype, (2) every tensor gets its expected type (default or listed override),
// (3) every listed override actually matched a tensor (catches stale entries).
// Returns true only if all three checks pass.
static bool run_test_section(llama_model & mdl,
                             const std::vector<mock_tensor> & tensors,
                             const snapshot_section & section) {
    // verify default_type matches what llama_ftype_default_type returns
    ggml_type computed_default = llama_ftype_default_type(section.ftype);
    if (computed_default != section.default_type) {
        printf(" FAIL [%s] default type mismatch: file says %s, code says %s\n",
               llama_ftype_to_name(section.ftype),
               ggml_type_name(section.default_type),
               ggml_type_name(computed_default));
        return false;
    }

    auto types = compute_quant_types(mdl, tensors, section.ftype);
    std::map<std::string, ggml_type> override_map(section.overrides.begin(), section.overrides.end());

    bool all_pass = true;
    int n_override_found = 0;  // overrides that matched at least one tensor
    for (const auto & [name, got] : types) {
        ggml_type expected = section.default_type;
        auto it = override_map.find(name);
        if (it != override_map.end()) {
            expected = it->second;
            n_override_found++;
        }
        if (got != expected) {
            printf(" FAIL %-50s expected %s, got %s\n",
                   name.c_str(), ggml_type_name(expected), ggml_type_name(got));
            all_pass = false;
        }
    }
    // an unmatched override means the snapshot references a tensor that no
    // longer exists (or was renamed) in the model
    if (n_override_found != (int)section.overrides.size()) {
        printf(" FAIL [%s] override count mismatch: listed %d, matched %d\n",
               llama_ftype_to_name(section.ftype),
               (int)section.overrides.size(), n_override_found);
        all_pass = false;
    }
    return all_pass;
}
// Default test mode: for each model spec, fetch metadata, load its snapshot
// file, and run every ftype section through run_test_section.
// Models that cannot be fetched or have no snapshot are skipped, not failed.
// Returns 1 if any model had a failing section, else 0.
static int run_remote_tests(const std::string & snapshot_dir, const char * argv0) {
    int total_pass = 0;
    int total_fail = 0;
    int total_skip = 0;

    for (int m = 0; m < n_model_specs; m++) {
        const auto & spec = model_specs[m];
        std::string name = model_name_from_repo(spec.repo);
        printf("=== %s ===\n", name.c_str());

        fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
        auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
        if (!result.has_value()) {
            printf(" SKIP (could not fetch model metadata)\n\n");
            total_skip++;
            continue;
        }
        const auto & remote = result.value();
        auto model = build_mock_model_from_remote(remote);
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        auto tensors = build_mock_tensors(remote, model->arch, qparams);

        std::string snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
        std::vector<snapshot_section> sections;
        if (!parse_snapshot_file(snapshot_path, sections)) {
            printf(" SKIP (could not read snapshot file: %s)\n\n", snapshot_path.c_str());
            total_skip++;
            continue;
        }

        // one pass/fail verdict per ftype section; the model passes only if
        // every section passes
        int model_pass = 0;
        int model_fail = 0;
        for (const auto & section : sections) {
            bool pass = run_test_section(*model, tensors, section);
            if (pass) {
                model_pass++;
            } else {
                model_fail++;
            }
        }
        printf(" %s %s: %d/%d ftype sections passed (%d tensors)\n",
               model_fail == 0 ? "PASS" : "FAIL",
               name.c_str(), model_pass, model_pass + model_fail,
               (int)tensors.size());
        printf("\n");
        if (model_fail == 0) {
            total_pass++;
        } else {
            total_fail++;
        }
    }

    printf("%d/%d models passed", total_pass, total_pass + total_fail);
    if (total_skip > 0) {
        printf(", %d skipped", total_skip);
    }
    printf("\n");
    if (total_fail > 0) {
        // point the developer at the regeneration workflow for intentional changes
        printf("\nIf these changes are intentional, regenerate snapshot files with:\n");
        printf(" %s --generate\n", argv0);
    }
    return total_fail > 0 ? 1 : 0;
}
// Entry point: `--generate` regenerates snapshots, `--snapshot-dir DIR`
// overrides the build-time SNAPSHOT_DIR; default runs the snapshot tests.
int main(int argc, char ** argv) {
    std::string snapshot_dir = SNAPSHOT_DIR;
    bool generate = false;

    for (int i = 1; i < argc; i++) {
        const char * arg = argv[i];
        if (strcmp(arg, "--generate") == 0) {
            generate = true;
        } else if (strcmp(arg, "--snapshot-dir") == 0 && i + 1 < argc) {
            snapshot_dir = argv[++i];
        }
    }

    return generate ? run_generate(snapshot_dir) : run_remote_tests(snapshot_dir, argv[0]);
}