Merge 9c00aab225 into 3fc6f1aed1
This commit is contained in:
commit
e17eb694cf
|
|
@ -1,8 +1,8 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama-context.h"
|
||||
#include "ggml.h"
|
||||
#include "stdint.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
|
|
@ -10,3 +10,47 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
|||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs);
|
||||
|
||||
// Get the default ggml_type for a given ftype.
|
||||
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||
|
||||
// Quantization state.
|
||||
struct quantize_state_impl;
|
||||
|
||||
LLAMA_API quantize_state_impl * llama_quant_init(
|
||||
const llama_model * model,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
LLAMA_API void llama_quant_free(quantize_state_impl * qs);
|
||||
|
||||
// Descriptor for constructing a mock model for quantization testing.
|
||||
struct llama_quant_model_desc {
|
||||
const char * architecture;
|
||||
uint32_t n_embd;
|
||||
uint32_t n_ff;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_head;
|
||||
uint32_t n_head_kv;
|
||||
uint32_t n_expert;
|
||||
uint32_t n_embd_head_k;
|
||||
uint32_t n_embd_head_v;
|
||||
};
|
||||
|
||||
// Create a mock model from a metadata descriptor (for testing).
|
||||
// The returned model must be freed with llama_model_free().
|
||||
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
|
||||
|
||||
// Returns true if this tensor should be quantized (based on name, dims, params).
|
||||
LLAMA_API bool llama_quant_tensor_allows_quantization(
|
||||
const quantize_state_impl * qs,
|
||||
const ggml_tensor * tensor);
|
||||
|
||||
// Compute quantization type assignments for a list of tensors.
|
||||
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
|
||||
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
|
||||
LLAMA_API void llama_quant_compute_types(
|
||||
quantize_state_impl * qs,
|
||||
llama_ftype ftype,
|
||||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors);
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "llama.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-model-loader.h"
|
||||
#include "llama-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
|
|
@ -199,6 +199,7 @@ struct quantize_state_impl {
|
|||
|
||||
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
||||
struct tensor_metadata {
|
||||
std::string name;
|
||||
ggml_type target_type;
|
||||
tensor_category category;
|
||||
std::string remapped_imatrix_name;
|
||||
|
|
@ -784,7 +785,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
|
|||
// given a file type, get the default tensor type
|
||||
//
|
||||
|
||||
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
||||
|
|
@ -823,16 +824,32 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
default: return GGML_TYPE_COUNT;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
|
||||
for (auto & tm : metadata) {
|
||||
tensor_category cat = tensor_get_category(tm.name);
|
||||
tm.category = cat;
|
||||
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
}
|
||||
|
||||
//
|
||||
// main quantization driver
|
||||
//
|
||||
|
||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
|
||||
int nthread = params->nthread;
|
||||
|
|
@ -841,7 +858,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
nthread = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
default_type = llama_ftype_get_default_type(ftype);
|
||||
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||
if (default_type == GGML_TYPE_COUNT) {
|
||||
throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
||||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||
|
|
@ -961,6 +981,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
});
|
||||
}
|
||||
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
metadata[i].name = ggml_get_name(tensors[i]->tensor);
|
||||
}
|
||||
|
||||
// initialize quantization state counters and metadata categories
|
||||
init_quantize_state_counters(qs, metadata);
|
||||
|
||||
int idx = 0;
|
||||
uint16_t n_split = 1;
|
||||
|
||||
|
|
@ -973,25 +1002,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
||||
ctx_outs[0] = std::move(ctx_out);
|
||||
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
|
||||
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
||||
{
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
metadata[i].category = cat; // save and re-use the category while we're at it
|
||||
}
|
||||
// these also need to be set to n_layer by default
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
}
|
||||
|
||||
// flag for --dry-run
|
||||
bool will_require_imatrix = false;
|
||||
|
||||
|
|
@ -1002,7 +1012,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto * it = tensors[i];
|
||||
const struct ggml_tensor * tensor = it->tensor;
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
uint16_t i_split = params->keep_split ? it->idx : 0;
|
||||
if (!ctx_outs[i_split]) {
|
||||
|
|
@ -1031,7 +1040,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
" - offending tensor: %s\n"
|
||||
" - target type: %s\n"
|
||||
"============================================================================\n\n",
|
||||
name.c_str(), ggml_type_name(metadata[i].target_type));
|
||||
metadata[i].name.c_str(), ggml_type_name(metadata[i].target_type));
|
||||
throw std::runtime_error("this quantization requires an imatrix!");
|
||||
}
|
||||
}
|
||||
|
|
@ -1104,7 +1113,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
new_ofstream(weight.idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
|
||||
if (!params->dry_run) {
|
||||
|
|
@ -1235,9 +1243,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), metadata[i].name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
|
|
@ -1302,3 +1310,89 @@ uint32_t llama_model_quantize(
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Helper functions for external tools exposed in llama-ext.h
|
||||
//
|
||||
|
||||
quantize_state_impl * llama_quant_init(
|
||||
const llama_model * model,
|
||||
const llama_model_quantize_params * params) {
|
||||
return new quantize_state_impl(*model, params);
|
||||
}
|
||||
|
||||
void llama_quant_free(quantize_state_impl * qs) {
|
||||
delete qs;
|
||||
}
|
||||
|
||||
llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
|
||||
struct llama_model_params mparams = llama_model_default_params();
|
||||
auto * model = new llama_model(mparams);
|
||||
|
||||
model->arch = llm_arch_from_string(desc->architecture);
|
||||
|
||||
// infer llm_type: only LLM_TYPE_70B matters for quantization logic
|
||||
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
|
||||
model->type = LLM_TYPE_70B;
|
||||
}
|
||||
|
||||
model->hparams.n_embd = desc->n_embd;
|
||||
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
|
||||
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
|
||||
model->hparams.n_layer = desc->n_layer;
|
||||
model->hparams.n_expert = desc->n_expert;
|
||||
|
||||
for (uint32_t i = 0; i < desc->n_layer; i++) {
|
||||
model->hparams.n_head_arr[i] = desc->n_head;
|
||||
model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
|
||||
model->hparams.n_ff_arr[i] = desc->n_ff;
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
bool llama_quant_tensor_allows_quantization(
|
||||
const quantize_state_impl * qs,
|
||||
const ggml_tensor * tensor) {
|
||||
return tensor_allows_quantization(qs->params, qs->model.arch, tensor);
|
||||
}
|
||||
|
||||
void llama_quant_compute_types(
|
||||
quantize_state_impl * qs,
|
||||
llama_ftype ftype,
|
||||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors) {
|
||||
// reset per-computation state
|
||||
qs->n_attention_wv = 0;
|
||||
qs->n_ffn_down = 0;
|
||||
qs->n_ffn_gate = 0;
|
||||
qs->n_ffn_up = 0;
|
||||
qs->i_attention_wv = 0;
|
||||
qs->i_ffn_down = 0;
|
||||
qs->i_ffn_gate = 0;
|
||||
qs->i_ffn_up = 0;
|
||||
qs->n_fallback = 0;
|
||||
qs->has_imatrix = false;
|
||||
qs->has_tied_embeddings = true;
|
||||
|
||||
// build metadata from tensor names
|
||||
std::vector<tensor_metadata> metadata(n_tensors);
|
||||
for (size_t i = 0; i < n_tensors; i++) {
|
||||
metadata[i].name = ggml_get_name(tensors[i]);
|
||||
}
|
||||
|
||||
// initialize counters and categories
|
||||
init_quantize_state_counters(*qs, metadata);
|
||||
|
||||
// use a local copy of params with the requested ftype
|
||||
llama_model_quantize_params local_params = *qs->params;
|
||||
local_params.ftype = ftype;
|
||||
|
||||
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||
|
||||
// compute types
|
||||
for (size_t i = 0; i < n_tensors; i++) {
|
||||
result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
*
|
||||
!*.*
|
||||
!snapshots/
|
||||
*.o
|
||||
ggml-common.h
|
||||
**/*.swp
|
||||
|
|
|
|||
|
|
@ -274,6 +274,12 @@ if (TARGET cpp-httplib)
|
|||
add_executable(test-gguf-model-data test-gguf-model-data.cpp)
|
||||
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
|
||||
llama_test(test-gguf-model-data LABEL "model")
|
||||
|
||||
# test-quant-type-selection requires gguf-model-data for remote model metadata
|
||||
llama_build_and_test(test-quant-type-selection.cpp LABEL "model")
|
||||
target_link_libraries(test-quant-type-selection PRIVATE gguf-model-data)
|
||||
target_compile_definitions(test-quant-type-selection PRIVATE
|
||||
SNAPSHOT_DIR="${CMAKE_CURRENT_SOURCE_DIR}/snapshots")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -124,6 +124,35 @@ static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
|
|||
}
|
||||
|
||||
static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
|
||||
// Handle array-valued fields (e.g. per-layer head counts in hybrid models)
|
||||
// by reading the first element as a representative value.
|
||||
if (vtype == GGUF_TYPE_ARRAY) {
|
||||
int32_t elem_type;
|
||||
uint64_t count;
|
||||
if (!r.read_val(elem_type)) {
|
||||
return false;
|
||||
}
|
||||
if (!r.read_val(count)) {
|
||||
return false;
|
||||
}
|
||||
if (count == 0) {
|
||||
return false;
|
||||
}
|
||||
// Read first element, skip the rest
|
||||
if (!gguf_read_uint32_val(r, elem_type, out)) {
|
||||
return false;
|
||||
}
|
||||
for (uint64_t i = 1; i < count; i++) {
|
||||
size_t sz = gguf_val_type_size(elem_type);
|
||||
if (sz == 0) {
|
||||
return false;
|
||||
}
|
||||
if (!r.skip(sz)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (vtype == GGUF_TYPE_UINT8) {
|
||||
uint8_t v;
|
||||
if (!r.read_val(v)) {
|
||||
|
|
@ -486,7 +515,8 @@ static std::string detect_gguf_filename(const std::string & repo, const std::str
|
|||
static std::optional<gguf_remote_model> fetch_and_parse(
|
||||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cache_path) {
|
||||
const std::string & cache_path,
|
||||
bool verbose) {
|
||||
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
|
||||
|
||||
// Progressive download inspired by RangeView.fetchChunk()
|
||||
|
|
@ -495,7 +525,9 @@ static std::optional<gguf_remote_model> fetch_and_parse(
|
|||
const size_t max_chunk = 64 * 1024 * 1024;
|
||||
|
||||
while (chunk_size <= max_chunk) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
}
|
||||
|
||||
char range_buf[64];
|
||||
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
|
||||
|
|
@ -536,7 +568,8 @@ static std::optional<gguf_remote_model> fetch_or_cached(
|
|||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cdir,
|
||||
const std::string & repo_part) {
|
||||
const std::string & repo_part,
|
||||
bool verbose) {
|
||||
std::string fname_part = sanitize_for_path(filename);
|
||||
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
|
||||
|
||||
|
|
@ -545,20 +578,23 @@ static std::optional<gguf_remote_model> fetch_or_cached(
|
|||
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
|
||||
auto result = gguf_parse_meta(cached);
|
||||
if (result.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fs_create_directory_with_parents(cdir);
|
||||
return fetch_and_parse(repo, filename, cache_path);
|
||||
return fetch_and_parse(repo, filename, cache_path, verbose);
|
||||
}
|
||||
|
||||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant,
|
||||
const std::string & cache_dir) {
|
||||
const std::string & cache_dir,
|
||||
bool verbose) {
|
||||
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
|
||||
std::string repo_part = sanitize_for_path(repo);
|
||||
|
||||
|
|
@ -568,7 +604,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
|
||||
if (!model_opt.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
|
||||
return std::nullopt;
|
||||
|
|
@ -583,8 +619,10 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
|
|
@ -592,7 +630,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
|
||||
return std::nullopt;
|
||||
|
|
|
|||
|
|
@ -39,4 +39,5 @@ struct gguf_remote_model {
|
|||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant = "Q8_0",
|
||||
const std::string & cache_dir = ""); // empty = default
|
||||
const std::string & cache_dir = "", // empty = default
|
||||
bool verbose = true);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -116,6 +116,39 @@ int main() {
|
|||
// Verify tensor count
|
||||
TEST_ASSERT(model3.tensors.size() == 780, "expected tensor count == 780");
|
||||
|
||||
// Test a hybrid-attention model with array-valued head counts
|
||||
auto result4 = gguf_fetch_model_meta("ggml-org/Step-3.5-Flash-GGUF", "Q4_K");
|
||||
if (!result4.has_value()) {
|
||||
fprintf(stderr, "FAIL: could not fetch Step-3.5-Flash metadata\n");
|
||||
return 1;
|
||||
}
|
||||
const auto & model4 = result4.value();
|
||||
|
||||
fprintf(stderr, "Architecture: %s\n", model4.architecture.c_str());
|
||||
fprintf(stderr, "n_embd: %u\n", model4.n_embd);
|
||||
fprintf(stderr, "n_ff: %u\n", model4.n_ff);
|
||||
fprintf(stderr, "n_vocab: %u\n", model4.n_vocab);
|
||||
fprintf(stderr, "n_layer: %u\n", model4.n_layer);
|
||||
fprintf(stderr, "n_head: %u\n", model4.n_head);
|
||||
fprintf(stderr, "n_head_kv: %u\n", model4.n_head_kv);
|
||||
fprintf(stderr, "n_expert: %u\n", model4.n_expert);
|
||||
fprintf(stderr, "n_embd_head_k: %u\n", model4.n_embd_head_k);
|
||||
fprintf(stderr, "n_embd_head_v: %u\n", model4.n_embd_head_v);
|
||||
fprintf(stderr, "tensors: %zu\n", model4.tensors.size());
|
||||
|
||||
TEST_ASSERT(model4.architecture == "step35", "expected architecture 'step35'");
|
||||
|
||||
TEST_ASSERT(model4.n_layer == 45, "expected n_layer == 45");
|
||||
TEST_ASSERT(model4.n_embd == 4096, "expected n_embd == 4096");
|
||||
TEST_ASSERT(model4.n_ff == 11264, "expected n_ff == 11264");
|
||||
TEST_ASSERT(model4.n_head == 64, "expected n_head == 64 (first element of per-layer array)");
|
||||
TEST_ASSERT(model4.n_head_kv == 8, "expected n_head_kv == 8 (first element of per-layer array)");
|
||||
TEST_ASSERT(model4.n_expert == 288, "expected n_expert == 288");
|
||||
TEST_ASSERT(model4.n_embd_head_k == 128, "expected n_embd_head_k == 128");
|
||||
TEST_ASSERT(model4.n_embd_head_v == 128, "expected n_embd_head_v == 128");
|
||||
TEST_ASSERT(model4.n_vocab == 128896, "expected n_vocab == 128896");
|
||||
TEST_ASSERT(model4.tensors.size() == 754, "expected tensor count == 754");
|
||||
|
||||
fprintf(stderr, "=== ALL TESTS PASSED ===\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,512 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "gguf-model-data.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ftype name <-> enum mapping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct ftype_name_entry {
|
||||
const char * name;
|
||||
llama_ftype ftype;
|
||||
};
|
||||
|
||||
static const ftype_name_entry ftype_name_table[] = {
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32 },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16 },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16 },
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0 },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1 },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0 },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1 },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0 },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S },
|
||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M },
|
||||
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M },
|
||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M },
|
||||
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL },
|
||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS },
|
||||
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0 },
|
||||
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0 },
|
||||
{ "MXFP4_MOE", LLAMA_FTYPE_MOSTLY_MXFP4_MOE },
|
||||
};
|
||||
|
||||
static llama_ftype llama_ftype_from_name(const char * name) {
|
||||
for (const auto & e : ftype_name_table) {
|
||||
if (strcmp(name, e.name) == 0) {
|
||||
return e.ftype;
|
||||
}
|
||||
}
|
||||
return (llama_ftype)-1;
|
||||
}
|
||||
|
||||
static const char * llama_ftype_to_name(llama_ftype ftype) {
|
||||
for (const auto & e : ftype_name_table) {
|
||||
if (e.ftype == ftype) {
|
||||
return e.name;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ggml_type name lookup
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static ggml_type ggml_type_from_name(const std::string & name) {
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const char * tname = ggml_type_name((ggml_type) i);
|
||||
if (tname && name == tname) {
|
||||
return (ggml_type) i;
|
||||
}
|
||||
}
|
||||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// File parser for snapshot files (quant type schemas)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct snapshot_section {
|
||||
llama_ftype ftype;
|
||||
ggml_type default_type;
|
||||
std::vector<std::pair<std::string, ggml_type>> overrides;
|
||||
};
|
||||
|
||||
// This function is pretty ugly, but it's a trade-off of readable snapshot files
|
||||
// versus readable parsing code
|
||||
static bool parse_snapshot_file(const std::string & path, std::vector<snapshot_section> & sections) {
|
||||
std::ifstream f(path);
|
||||
if (!f.good()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
snapshot_section * cur = nullptr;
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// section header: [FTYPE_NAME] default_type
|
||||
if (line[0] == '[') {
|
||||
auto close = line.find(']');
|
||||
if (close == std::string::npos) {
|
||||
fprintf(stderr, "parse error: missing ] in '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
std::string ftype_str = line.substr(1, close - 1);
|
||||
std::string default_str;
|
||||
size_t pos = close + 1;
|
||||
while (pos < line.size() && line[pos] == ' ') {
|
||||
pos++;
|
||||
}
|
||||
default_str = line.substr(pos);
|
||||
|
||||
llama_ftype ftype = llama_ftype_from_name(ftype_str.c_str());
|
||||
if ((int) ftype < 0) {
|
||||
fprintf(stderr, "parse error: unknown ftype '%s'\n", ftype_str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_type dtype = ggml_type_from_name(default_str);
|
||||
if (dtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "parse error: unknown default type '%s'\n", default_str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
sections.push_back({ ftype, dtype, {} });
|
||||
cur = §ions.back();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!cur) {
|
||||
fprintf(stderr, "parse error: tensor line before any section: '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto sp = line.rfind(' ');
|
||||
if (sp == std::string::npos) {
|
||||
fprintf(stderr, "parse error: no space in tensor line: '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string tname = line.substr(0, sp);
|
||||
std::string ttype = line.substr(sp + 1);
|
||||
|
||||
ggml_type gt = ggml_type_from_name(ttype);
|
||||
if (gt == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "parse error: unknown type '%s' for tensor '%s'\n", ttype.c_str(), tname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
cur->overrides.push_back({ tname, gt });
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Remote model support using gguf-model-data.cpp
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct remote_model_spec {
|
||||
const char * repo;
|
||||
const char * quant;
|
||||
};
|
||||
|
||||
// Get model name from repo: strip org prefix, strip -GGUF suffix,
|
||||
// and strip anything up to and including first '_' (e.g. "deepseek-ai_DeepSeek-V3.1").
|
||||
static std::string model_name_from_repo(const char * repo) {
|
||||
std::string s(repo);
|
||||
|
||||
auto slash = s.find('/');
|
||||
if (slash != std::string::npos) {
|
||||
s = s.substr(slash + 1);
|
||||
}
|
||||
|
||||
const std::string suffix = "-GGUF";
|
||||
if (s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0) {
|
||||
s = s.substr(0, s.size() - suffix.size());
|
||||
}
|
||||
|
||||
auto underscore = s.find('_');
|
||||
if (underscore != std::string::npos) {
|
||||
s = s.substr(underscore + 1);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static std::string snapshot_file_from_name(const std::string & name) {
|
||||
std::string lower = name;
|
||||
for (auto & c : lower) {
|
||||
c = std::tolower(c);
|
||||
}
|
||||
return lower;
|
||||
}
|
||||
|
||||
static const remote_model_spec model_specs[] = {
|
||||
{ "ggml-org/Qwen3-0.6B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/GLM-4.6V-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Step-3.5-Flash-GGUF", "Q4_K" },
|
||||
{ "ggml-org/Qwen3-Coder-Next-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Qwen3-14B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Nemotron-Nano-3-30B-A3B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/gpt-oss-120b-GGUF", "mxfp4" },
|
||||
{ "ggml-org/gemma-3-4b-it-GGUF", "Q8_0" },
|
||||
{ "bartowski/Meta-Llama-3.1-70B-Instruct-GGUF", "Q4_K_M" },
|
||||
{ "bartowski/deepseek-ai_DeepSeek-V3.1-GGUF", "IQ1_M" },
|
||||
{ "bartowski/Qwen_Qwen3.5-397B-A17B-GGUF", "IQ1_S" }, // TODO: swap with ggml-org if/when it's released
|
||||
{ "bartowski/Qwen_Qwen3.5-27B-GGUF", "Q8_0" }, // TODO: swap with ggml-org if/when it's released
|
||||
};
|
||||
|
||||
static const int n_model_specs = (int) (sizeof(model_specs) / sizeof(model_specs[0]));
|
||||
|
||||
static llama_model * build_mock_model_from_remote(const gguf_remote_model & remote) {
|
||||
llama_quant_model_desc desc = {};
|
||||
desc.architecture = remote.architecture.c_str();
|
||||
desc.n_embd = remote.n_embd;
|
||||
desc.n_ff = remote.n_ff;
|
||||
desc.n_layer = remote.n_layer;
|
||||
desc.n_head = remote.n_head;
|
||||
desc.n_head_kv = remote.n_head_kv;
|
||||
desc.n_expert = remote.n_expert;
|
||||
desc.n_embd_head_k = remote.n_embd_head_k;
|
||||
desc.n_embd_head_v = remote.n_embd_head_v;
|
||||
return llama_quant_model_from_metadata(&desc);
|
||||
}
|
||||
|
||||
// Single ggml context holding all quantizable tensors for a model.
|
||||
struct mock_tensors {
|
||||
ggml_context_ptr ctx;
|
||||
std::vector<ggml_tensor *> tensors;
|
||||
};
|
||||
|
||||
static mock_tensors build_mock_tensors(const quantize_state_impl * qs,
|
||||
const gguf_remote_model & remote) {
|
||||
const size_t ctx_size = remote.tensors.size() * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = { ctx_size, nullptr, true };
|
||||
ggml_context_ptr ctx(ggml_init(params));
|
||||
|
||||
std::vector<ggml_tensor *> result;
|
||||
|
||||
for (const auto & t : remote.tensors) {
|
||||
ggml_tensor * gt = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32,
|
||||
t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
|
||||
ggml_set_name(gt, t.name.c_str());
|
||||
if (llama_quant_tensor_allows_quantization(qs, gt)) {
|
||||
result.push_back(gt);
|
||||
}
|
||||
}
|
||||
|
||||
return { std::move(ctx), std::move(result) };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate mode: regenerate all snapshot files
|
||||
// Use this when either adding new models or modifying quants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static std::string generate_snapshot(const std::string & name,
|
||||
const gguf_remote_model & remote,
|
||||
quantize_state_impl * qs,
|
||||
mock_tensors & mt) {
|
||||
std::ostringstream out;
|
||||
|
||||
out << "# Model: " << name << "\n";
|
||||
out << "# n_embd=" << remote.n_embd << ", n_ff=" << remote.n_ff << ", n_vocab=" << remote.n_vocab
|
||||
<< ", n_layer=" << remote.n_layer << ", n_head=" << remote.n_head << ", n_head_kv=" << remote.n_head_kv;
|
||||
if (remote.n_expert > 0) {
|
||||
out << ", n_expert=" << remote.n_expert;
|
||||
}
|
||||
out << "\n";
|
||||
|
||||
for (int i = 0; i < LLAMA_FTYPE_GUESSED; i++) {
|
||||
llama_ftype ft = (llama_ftype) i;
|
||||
ggml_type default_type = llama_ftype_get_default_type(ft);
|
||||
if (default_type == GGML_TYPE_COUNT) {
|
||||
continue;
|
||||
}
|
||||
const char * fname = llama_ftype_to_name(ft);
|
||||
if (!fname) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<ggml_type> result_types(mt.tensors.size());
|
||||
llama_quant_compute_types(qs, ft, mt.tensors.data(), result_types.data(), mt.tensors.size());
|
||||
|
||||
out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n";
|
||||
for (size_t j = 0; j < mt.tensors.size(); j++) {
|
||||
if (result_types[j] != default_type) {
|
||||
out << ggml_get_name(mt.tensors[j]) << " " << ggml_type_name(result_types[j]) << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static int run_generate(const std::string & snapshot_dir) {
|
||||
fprintf(stderr, "This will overwrite all snapshot files in:\n %s\n", snapshot_dir.c_str());
|
||||
fprintf(stderr, "Continue? [y/N] ");
|
||||
int ch = fgetc(stdin);
|
||||
if (ch != 'y' && ch != 'Y') {
|
||||
fprintf(stderr, "Aborted.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
int n_written = 0;
|
||||
|
||||
for (int m = 0; m < n_model_specs; m++) {
|
||||
const auto & spec = model_specs[m];
|
||||
std::string name = model_name_from_repo(spec.repo);
|
||||
|
||||
fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
|
||||
if (!result.has_value()) {
|
||||
fprintf(stderr, "ERROR: could not fetch model metadata for %s\n", name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto & remote = result.value();
|
||||
llama_model * model = build_mock_model_from_remote(remote);
|
||||
llama_model_quantize_params qparams = llama_model_quantize_default_params();
|
||||
quantize_state_impl * qs = llama_quant_init(model, &qparams);
|
||||
auto mt = build_mock_tensors(qs, remote);
|
||||
|
||||
std::string content = generate_snapshot(name, remote, qs, mt);
|
||||
std::string path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
|
||||
|
||||
std::ofstream f(path);
|
||||
if (!f.good()) {
|
||||
fprintf(stderr, "ERROR: could not write %s\n", path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
return 1;
|
||||
}
|
||||
f << content;
|
||||
n_written++;
|
||||
fprintf(stderr, " wrote %s\n", path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%d files written\n", n_written);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test mode: compare against snapshot files
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Verify one snapshot section against freshly computed quantization types.
// Prints a FAIL line for each mismatch; returns true only if everything
// in the section agrees with the current code.
static bool run_test_section(quantize_state_impl * qs,
                             mock_tensors & mt,
                             const snapshot_section & section) {
    // the default type recorded in the file must match the current code
    const ggml_type code_default = llama_ftype_get_default_type(section.ftype);
    if (code_default != section.default_type) {
        printf(" FAIL [%s] default type mismatch: file says %s, code says %s\n", llama_ftype_to_name(section.ftype),
            ggml_type_name(section.default_type), ggml_type_name(code_default));
        return false;
    }

    std::vector<ggml_type> computed(mt.tensors.size());
    llama_quant_compute_types(qs, section.ftype, mt.tensors.data(), computed.data(), mt.tensors.size());

    // name -> expected type for every override the snapshot lists
    std::map<std::string, ggml_type> overrides(section.overrides.begin(), section.overrides.end());

    bool ok        = true;
    int  n_matched = 0;

    for (size_t t = 0; t < mt.tensors.size(); t++) {
        const char * tname = ggml_get_name(mt.tensors[t]);

        ggml_type want = section.default_type;
        const auto it  = overrides.find(tname);
        if (it != overrides.end()) {
            want = it->second;
            n_matched++;
        }

        if (computed[t] != want) {
            printf(" FAIL %-50s %-10s expected %s, got %s\n", tname, llama_ftype_to_name(section.ftype), ggml_type_name(want), ggml_type_name(computed[t]));
            ok = false;
        }
    }

    // every listed override must correspond to an actual tensor
    if (n_matched != (int) section.overrides.size()) {
        printf(" FAIL [%s] override count mismatch: listed %d, matched %d\n", llama_ftype_to_name(section.ftype),
            (int) section.overrides.size(), n_matched);
        ok = false;
    }

    return ok;
}
|
||||
|
||||
static int run_remote_tests(const std::string & snapshot_dir, const char * argv0) {
|
||||
int total_pass = 0;
|
||||
int total_fail = 0;
|
||||
int total_skip = 0;
|
||||
|
||||
for (int m = 0; m < n_model_specs; m++) {
|
||||
const auto & spec = model_specs[m];
|
||||
std::string name = model_name_from_repo(spec.repo);
|
||||
printf("=== %s ===\n", name.c_str());
|
||||
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
|
||||
if (!result.has_value()) {
|
||||
printf(" SKIP (could not fetch model metadata)\n\n");
|
||||
total_skip++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto & remote = result.value();
|
||||
llama_model * model = build_mock_model_from_remote(remote);
|
||||
llama_model_quantize_params qparams = llama_model_quantize_default_params();
|
||||
quantize_state_impl * qs = llama_quant_init(model, &qparams);
|
||||
auto mt = build_mock_tensors(qs, remote);
|
||||
|
||||
std::string snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
|
||||
std::vector<snapshot_section> sections;
|
||||
if (!parse_snapshot_file(snapshot_path, sections)) {
|
||||
printf(" SKIP (could not read snapshot file: %s)\n\n", snapshot_path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
total_skip++;
|
||||
continue;
|
||||
}
|
||||
|
||||
int model_pass = 0;
|
||||
int model_fail = 0;
|
||||
|
||||
for (const auto & section : sections) {
|
||||
bool pass = run_test_section(qs, mt, section);
|
||||
if (pass) {
|
||||
model_pass++;
|
||||
} else {
|
||||
model_fail++;
|
||||
}
|
||||
}
|
||||
|
||||
printf(" %s %s: %d/%d ftype sections passed (%d tensors)\n", model_fail == 0 ? "PASS" : "FAIL", name.c_str(),
|
||||
model_pass, model_pass + model_fail, (int) mt.tensors.size());
|
||||
printf("\n");
|
||||
|
||||
if (model_fail == 0) {
|
||||
total_pass++;
|
||||
} else {
|
||||
total_fail++;
|
||||
}
|
||||
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
}
|
||||
|
||||
printf("%d/%d models passed", total_pass, total_pass + total_fail);
|
||||
if (total_skip > 0) {
|
||||
printf(", %d skipped", total_skip);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (total_fail > 0) {
|
||||
printf("\nIf these changes are intentional, regenerate snapshot files with:\n");
|
||||
printf(" %s --generate\n", argv0);
|
||||
}
|
||||
|
||||
return total_fail > 0 ? 1 : 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string snapshot_dir = SNAPSHOT_DIR;
|
||||
bool generate = false;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "--generate") == 0) {
|
||||
generate = true;
|
||||
} else if (strcmp(argv[i], "--snapshot-dir") == 0 && i + 1 < argc) {
|
||||
snapshot_dir = argv[++i];
|
||||
}
|
||||
}
|
||||
|
||||
if (generate) {
|
||||
return run_generate(snapshot_dir);
|
||||
}
|
||||
|
||||
// suppress llama log warnings during test (e.g. tensor type fallback messages)
|
||||
llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
|
||||
|
||||
return run_remote_tests(snapshot_dir, argv[0]);
|
||||
}
|
||||
Loading…
Reference in New Issue