From 0b3cc323d97bd2b240203952629eba3d522cde30 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Mon, 16 Mar 2026 15:06:46 -0400
Subject: [PATCH] Cleanup based on review comments

---
 src/llama-ext.h                     |  64 +++++----
 src/llama-quant.cpp                 | 126 ++++++++++++++++-
 src/llama-quant.h                   |  39 ------
 tests/test-quant-type-selection.cpp | 204 ++++++++++------------------
 4 files changed, 232 insertions(+), 201 deletions(-)
diff --git a/src/llama-ext.h b/src/llama-ext.h
index a29dcf1536..7f632ab211 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -2,13 +2,7 @@
 
 #include "llama.h"
 
-// TODO: try to remove this headers
-#include "llama-arch.h"
-#include "llama-model.h"
-#include "llama-quant.h"
-
 #include <cstdint>
-#include <vector>
 
 // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
 LLAMA_API struct ggml_cgraph * llama_graph_reserve(
@@ -17,28 +11,46 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
         uint32_t n_seqs,
         uint32_t n_outputs);
 
+// Get the default ggml_type for a given ftype.
 LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
 
-// TODO: use llama_quant_ prefix to name these consistently:
+// Quantization state.
+struct llama_quant;
+
+LLAMA_API llama_quant * llama_quant_init(
+        const llama_model * model,
+        const llama_model_quantize_params * params);
+
+LLAMA_API void llama_quant_free(llama_quant * qnt);
+
+// Descriptor for constructing a mock model for quantization testing.
+struct llama_quant_model_desc {
+    const char * architecture;
+    uint32_t n_embd;
+    uint32_t n_ff;
+    uint32_t n_layer;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_expert;
+    uint32_t n_embd_head_k;
+    uint32_t n_embd_head_v;
+};
+
+// Create a mock model from a metadata descriptor (for testing).
+// The returned model must be freed with llama_model_free().
+LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
 
 // Returns true if this tensor should be quantized (based on name, dims, params).
-LLAMA_API bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);
+LLAMA_API bool llama_quant_tensor_allows_quantization(
+        const llama_quant * qnt,
+        const ggml_tensor * tensor);
 
-// TODO: add:
-// LLAMA_API llama_quant * llama_quant_init(...);
-// LLAMA_API void llama_quant_free(llama_quant * qnt);
-
-// TODO: become member function of llama_quant
-LLAMA_API ggml_type llama_tensor_get_type(
-        llama_quant & qs,
-        const llama_model_quantize_params * params,
-        const ggml_tensor * tensor,
-        ggml_type default_type,
-        const tensor_metadata & tm);
-
-// Initialize llama_quant counters and populate tensor_metadata categories.
-// metadata: vector with name fields already set, will have category field populated.
-// TODO: become member function of llama_quant
-LLAMA_API void init_quantize_state_counters(
-        llama_quant & qs,
-        std::vector<tensor_metadata> & metadata);
+// Compute quantization type assignments for a list of tensors.
+// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
+// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
+LLAMA_API void llama_quant_compute_types(
+        llama_quant * qnt,
+        llama_ftype ftype,
+        ggml_tensor ** tensors,
+        ggml_type * result_types,
+        size_t n_tensors);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index d1a9d6d15c..8cb1298167 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -14,6 +14,39 @@
 #include <thread>
 #include <unordered_map>
 
+// tensor categorization - used to avoid repeated string matching in quantization logic.
+// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
+enum class tensor_category {
+    TOKEN_EMBD,
+    ATTENTION_Q,
+    ATTENTION_V,
+    ATTENTION_K,
+    ATTENTION_QKV,
+    ATTENTION_KV_B,
+    ATTENTION_OUTPUT,
+    FFN_UP,
+    FFN_GATE,
+    FFN_DOWN,
+    OUTPUT,
+    OTHER
+};
+
+// per-tensor metadata, computed in the preliminary loop and used in the main loop
+struct tensor_metadata {
+    std::string     name;
+    ggml_type       target_type;
+    tensor_category category;
+    std::string     remapped_imatrix_name;
+    bool            allows_quantization;
+    bool            requires_imatrix;
+};
+
+// result of parsing --tensor-type option
+// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
+struct tensor_type_option {
+    std::string name;
+    ggml_type   type = GGML_TYPE_COUNT;
+};
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -235,7 +268,7 @@ static void llama_tensor_dequantize_impl(
 // do we allow this tensor to be quantized?
 //
 
-bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
+static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
     // trivial checks first -- no string ops needed
     if (params->only_copy)       return false;
 
@@ -602,7 +635,7 @@ static ggml_type llama_tensor_get_type_impl(llama_quant & qs, ggml_type new_type
 }
 
 // outer wrapper: determine the ggml_type that this tensor should be quantized to
-ggml_type llama_tensor_get_type(llama_quant & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
+static ggml_type llama_tensor_get_type(llama_quant & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
     if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
         return tensor->type;
     }
@@ -777,7 +810,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
 }
 
 
-void init_quantize_state_counters(llama_quant & qs, std::vector<tensor_metadata> & metadata) {
+static void init_quantize_state_counters(llama_quant & qs, std::vector<tensor_metadata> & metadata) {
     for (auto & tm : metadata) {
         tensor_category cat = tensor_get_category(tm.name);
         tm.category = cat;
@@ -1258,3 +1291,90 @@ uint32_t llama_model_quantize(
 
     return 0;
 }
+
+//
+// Helper functions for external tools exposed in llama-ext.h
+//
+
+llama_quant * llama_quant_init(
+        const llama_model * model,
+        const llama_model_quantize_params * params) {
+    return new llama_quant(*model, params);
+}
+
+void llama_quant_free(llama_quant * qnt) {
+    delete qnt;
+}
+
+llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
+    struct llama_model_params mparams = llama_model_default_params();
+    auto * model = new llama_model(mparams);
+
+    model->arch = llm_arch_from_string(desc->architecture);
+
+    // infer llm_type: only LLM_TYPE_70B matters for quantization logic
+    if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
+        model->type = LLM_TYPE_70B;
+    }
+
+    model->hparams.n_embd             = desc->n_embd;
+    model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
+    model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
+    model->hparams.n_layer            = desc->n_layer;
+    model->hparams.n_expert           = desc->n_expert;
+
+    for (uint32_t i = 0; i < desc->n_layer; i++) {
+        model->hparams.n_head_arr[i]    = desc->n_head;
+        model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
+        model->hparams.n_ff_arr[i]      = desc->n_ff;
+    }
+
+    return model;
+}
+
+bool llama_quant_tensor_allows_quantization(
+        const llama_quant * qnt,
+        const ggml_tensor * tensor) {
+    return tensor_allows_quantization(qnt->params, qnt->model.arch, tensor);
+}
+
+void llama_quant_compute_types(
+        llama_quant * qnt,
+        llama_ftype ftype,
+        ggml_tensor ** tensors,
+        ggml_type * result_types,
+        size_t n_tensors) {
+    // reset per-computation state
+    qnt->n_attention_wv  = 0;
+    qnt->n_ffn_down      = 0;
+    qnt->n_ffn_gate      = 0;
+    qnt->n_ffn_up        = 0;
+    qnt->i_attention_wv  = 0;
+    qnt->i_ffn_down      = 0;
+    qnt->i_ffn_gate      = 0;
+    qnt->i_ffn_up        = 0;
+    qnt->n_k_quantized   = 0;
+    qnt->n_fallback      = 0;
+    qnt->has_imatrix     = false;
+    qnt->has_tied_embeddings = true;
+
+    // build metadata from tensor names
+    std::vector<tensor_metadata> metadata(n_tensors);
+    for (size_t i = 0; i < n_tensors; i++) {
+        metadata[i].name = ggml_get_name(tensors[i]);
+    }
+
+    // initialize counters and categories
+    init_quantize_state_counters(*qnt, metadata);
+
+    // use a local copy of params with the requested ftype
+    llama_model_quantize_params local_params = *qnt->params;
+    local_params.ftype = ftype;
+
+    ggml_type default_type = llama_ftype_get_default_type(ftype);
+
+    // compute types
+    for (size_t i = 0; i < n_tensors; i++) {
+        result_types[i] = llama_tensor_get_type(*qnt, &local_params, tensors[i], default_type, metadata[i]);
+    }
+}
diff --git a/src/llama-quant.h b/src/llama-quant.h
index 4a7800f98d..8aff106ddf 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -3,47 +3,8 @@
 #include "llama.h"
 
 #include <memory>
-#include <string>
 
 struct llama_model;
-
-// TODO: use llama_quant_ prefix to name these consistently:
-
-// tensor categorization - used to avoid repeated string matching in quantization logic.
-// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
-enum class tensor_category {
-    TOKEN_EMBD,
-    ATTENTION_Q,
-    ATTENTION_V,
-    ATTENTION_K,
-    ATTENTION_QKV,
-    ATTENTION_KV_B,
-    ATTENTION_OUTPUT,
-    FFN_UP,
-    FFN_GATE,
-    FFN_DOWN,
-    OUTPUT,
-    OTHER
-};
-
-// per-tensor metadata, computed in the preliminary loop and used in the main loop
-// TODO: probably should belong to llama_quant
-struct tensor_metadata {
-    std::string     name;
-    ggml_type       target_type;
-    tensor_category category;
-    std::string     remapped_imatrix_name;
-    bool            allows_quantization;
-    bool            requires_imatrix;
-};
-
-// result of parsing --tensor-type option
-// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
-struct tensor_type_option {
-    std::string name;
-    ggml_type   type = GGML_TYPE_COUNT;
-};
-
 struct compiled_tensor_type_patterns;
 
 struct llama_quant {
diff --git a/tests/test-quant-type-selection.cpp b/tests/test-quant-type-selection.cpp
index 7fbc6ce611..aa0f14bf9f 100644
--- a/tests/test-quant-type-selection.cpp
+++ b/tests/test-quant-type-selection.cpp
@@ -2,13 +2,13 @@
 
 #include "../src/llama-ext.h"
 
+#include "ggml-cpp.h"
 #include "gguf-model-data.h"
 
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <map>
-#include <memory>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -78,37 +78,9 @@ static const char * llama_ftype_to_name(llama_ftype ftype) {
 }
 
 // ---------------------------------------------------------------------------
-// Mock tensor construction - may be better to extract this in the future
+// ggml_type name lookup
 // ---------------------------------------------------------------------------
 
-struct mock_tensor {
-    ggml_context_ptr ctx;
-    ggml_tensor *    tensor;
-};
-
-static mock_tensor make_mock_tensor(const std::string & name,
-                                    int64_t             ne0,
-                                    int64_t             ne1,
-                                    int64_t             ne2 = 1,
-                                    int64_t             ne3 = 1) {
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
-        /*.mem_buffer =*/ nullptr,
-        /*.no_alloc   =*/ true,
-    };
-    ggml_context_ptr ctx(ggml_init(params));
-    ggml_tensor * t;
-    if (ne3 > 1) {
-        t = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, ne0, ne1, ne2, ne3);
-    } else if (ne2 > 1) {
-        t = ggml_new_tensor_3d(ctx.get(), GGML_TYPE_F32, ne0, ne1, ne2);
-    } else {
-        t = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, ne0, ne1);
-    }
-    ggml_set_name(t, name.c_str());
-    return { std::move(ctx), t };
-}
-
 static ggml_type ggml_type_from_name(const std::string & name) {
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         const char * tname = ggml_type_name((ggml_type) i);
@@ -260,94 +232,44 @@ static const remote_model_spec model_specs[] = {
 
 static const int n_model_specs = (int) (sizeof(model_specs) / sizeof(model_specs[0]));
 
-// Determine llm_type from metadata.
-// Only LLM_TYPE_70B matters -> probably can/should be dropped in the future
-static llm_type infer_llm_type(llm_arch arch, const gguf_remote_model & remote) {
-    if (arch == LLM_ARCH_LLAMA && remote.n_layer == 80 && remote.n_head != remote.n_head_kv) {
-        return LLM_TYPE_70B;
-    }
-    return LLM_TYPE_UNKNOWN;
+static llama_model * build_mock_model_from_remote(const gguf_remote_model & remote) {
+    llama_quant_model_desc desc = {};
+    desc.architecture  = remote.architecture.c_str();
+    desc.n_embd        = remote.n_embd;
+    desc.n_ff          = remote.n_ff;
+    desc.n_layer       = remote.n_layer;
+    desc.n_head        = remote.n_head;
+    desc.n_head_kv     = remote.n_head_kv;
+    desc.n_expert      = remote.n_expert;
+    desc.n_embd_head_k = remote.n_embd_head_k;
+    desc.n_embd_head_v = remote.n_embd_head_v;
+    return llama_quant_model_from_metadata(&desc);
 }
 
-static std::unique_ptr<llama_model> build_mock_model_from_remote(const gguf_remote_model & remote) {
-    struct llama_model_params mparams = llama_model_default_params();
-    auto                      model   = std::make_unique<llama_model>(mparams);
+// Single ggml context holding all quantizable tensors for a model.
+struct mock_tensors {
+    ggml_context_ptr           ctx;
+    std::vector<ggml_tensor *> tensors;
+};
 
-    model->arch = llm_arch_from_string(remote.architecture);
-    model->type = infer_llm_type(model->arch, remote);
+static mock_tensors build_mock_tensors(const llama_quant *       qnt,
+                                       const gguf_remote_model & remote) {
+    const size_t ctx_size = remote.tensors.size() * ggml_tensor_overhead();
+    struct ggml_init_params params = { ctx_size, nullptr, true };
+    ggml_context_ptr ctx(ggml_init(params));
 
-    model->hparams.n_embd        = remote.n_embd;
-    model->hparams.n_embd_head_k_full = remote.n_embd_head_k;
-    model->hparams.n_embd_head_v_full = remote.n_embd_head_v;
-    model->hparams.n_layer       = remote.n_layer;
-    model->hparams.n_expert      = remote.n_expert;
-
-    for (uint32_t i = 0; i < remote.n_layer; i++) {
-        model->hparams.n_head_arr[i]    = remote.n_head;
-        model->hparams.n_head_kv_arr[i] = remote.n_head_kv;
-        model->hparams.n_ff_arr[i]      = remote.n_ff;
-    }
-
-    return model;
-}
-
-static std::vector<mock_tensor> build_mock_tensors(const gguf_remote_model &           remote,
-                                                   llm_arch                            arch,
-                                                   const llama_model_quantize_params & qparams) {
-    std::vector<mock_tensor> result;
+    std::vector<ggml_tensor *> result;
 
     for (const auto & t : remote.tensors) {
-        auto mt = make_mock_tensor(t.name, t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
-        if (tensor_allows_quantization(&qparams, arch, mt.tensor)) {
-            result.push_back(std::move(mt));
+        ggml_tensor * gt = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32,
+                                               t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
+        ggml_set_name(gt, t.name.c_str());
+        if (llama_quant_tensor_allows_quantization(qnt, gt)) {
+            result.push_back(gt);
         }
     }
 
-    return result;
-}
-
-static std::string read_file_contents(const std::string & path) {
-    std::ifstream f(path);
-    if (!f.good()) {
-        return "";
-    }
-    std::ostringstream ss;
-    ss << f.rdbuf();
-    return ss.str();
-}
-
-// ---------------------------------------------------------------------------
-// Compute quantization type assignments per target ftype
-// ---------------------------------------------------------------------------
-
-// Returns {tensor_name, assigned_type} for each tensor, in order.
-// TODO: should likely be moved as a member function of llama_quant and expose through the `llama-ext.h` interface
-static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_model &                    mdl,
-                                                                          const std::vector<mock_tensor> & tensors,
-                                                                          llama_ftype                      ftype) {
-    llama_model_quantize_params qparams = llama_model_quantize_default_params();
-    qparams.ftype                       = ftype;
-
-    // TODO: call llama_quant_init(...)
-    llama_quant qs(mdl, &qparams);
-
-    std::vector<tensor_metadata> metadata(tensors.size());
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        metadata[i].name = tensors[i].tensor->name;
-    }
-    init_quantize_state_counters(qs, metadata);
-
-    ggml_type default_type = llama_ftype_get_default_type(ftype);
-
-    std::vector<std::pair<std::string, ggml_type>> result;
-    result.reserve(tensors.size());
-
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        ggml_type got = llama_tensor_get_type(qs, &qparams, tensors[i].tensor, default_type, metadata[i]);
-        result.push_back({ metadata[i].name, got });
-    }
-
-    return result;
+    return { std::move(ctx), std::move(result) };
 }
 
 // ---------------------------------------------------------------------------
@@ -355,10 +277,10 @@ static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_
 // Use this when either adding new models or modifying quants
 // ---------------------------------------------------------------------------
 
-static std::string generate_snapshot(const std::string &              name,
-                                     const gguf_remote_model &        remote,
-                                     llama_model &                    mdl,
-                                     const std::vector<mock_tensor> & tensors) {
+static std::string generate_snapshot(const std::string &       name,
+                                     const gguf_remote_model & remote,
+                                     llama_quant *             qnt,
+                                     mock_tensors &            mt) {
     std::ostringstream out;
 
     out << "# Model: " << name << "\n";
@@ -380,12 +302,13 @@ static std::string generate_snapshot(const std::string &              name,
             continue;
         }
 
-        auto types = compute_quant_types(mdl, tensors, ft);
+        std::vector<ggml_type> result_types(mt.tensors.size());
+        llama_quant_compute_types(qnt, ft, mt.tensors.data(), result_types.data(), mt.tensors.size());
 
         out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n";
-        for (const auto & [name, type] : types) {
-            if (type != default_type) {
-                out << name << " " << ggml_type_name(type) << "\n";
+        for (size_t j = 0; j < mt.tensors.size(); j++) {
+            if (result_types[j] != default_type) {
+                out << ggml_get_name(mt.tensors[j]) << " " << ggml_type_name(result_types[j]) << "\n";
             }
         }
     }
@@ -418,21 +341,26 @@ static int run_generate(const std::string & snapshot_dir) {
         }
 
         const auto &                remote  = result.value();
-        auto                        model   = build_mock_model_from_remote(remote);
+        llama_model *               model   = build_mock_model_from_remote(remote);
         llama_model_quantize_params qparams = llama_model_quantize_default_params();
-        auto                        tensors = build_mock_tensors(remote, model->arch, qparams);
+        llama_quant *               qnt     = llama_quant_init(model, &qparams);
+        auto                        mt      = build_mock_tensors(qnt, remote);
 
-        std::string content = generate_snapshot(name, remote, *model, tensors);
+        std::string content = generate_snapshot(name, remote, qnt, mt);
         std::string path    = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
 
         std::ofstream f(path);
         if (!f.good()) {
             fprintf(stderr, "ERROR: could not write %s\n", path.c_str());
+            llama_quant_free(qnt);
+            llama_model_free(model);
             return 1;
         }
         f << content;
         n_written++;
         fprintf(stderr, "  wrote %s\n", path.c_str());
+        llama_quant_free(qnt);
+        llama_model_free(model);
     }
 
     fprintf(stderr, "%d files written\n", n_written);
@@ -443,9 +371,9 @@ static int run_generate(const std::string & snapshot_dir) {
 // Test mode: compare against snapshot files
 // ---------------------------------------------------------------------------
 
-static bool run_test_section(llama_model &                    mdl,
-                             const std::vector<mock_tensor> & tensors,
-                             const snapshot_section &         section) {
+static bool run_test_section(llama_quant *            qnt,
+                             mock_tensors &           mt,
+                             const snapshot_section & section) {
     // verify default_type matches what llama_ftype_get_default_type returns
     ggml_type computed_default = llama_ftype_get_default_type(section.ftype);
     if (computed_default != section.default_type) {
@@ -454,14 +382,18 @@ static bool run_test_section(llama_model &                    mdl,
         return false;
     }
 
-    auto types = compute_quant_types(mdl, tensors, section.ftype);
+    std::vector<ggml_type> result_types(mt.tensors.size());
+    llama_quant_compute_types(qnt, section.ftype, mt.tensors.data(), result_types.data(), mt.tensors.size());
 
     std::map<std::string, ggml_type> override_map(section.overrides.begin(), section.overrides.end());
 
     bool all_pass         = true;
     int  n_override_found = 0;
 
-    for (const auto & [name, got] : types) {
+    for (size_t i = 0; i < mt.tensors.size(); i++) {
+        const char * name = ggml_get_name(mt.tensors[i]);
+        ggml_type    got  = result_types[i];
+
         ggml_type expected = section.default_type;
         auto      it       = override_map.find(name);
         if (it != override_map.end()) {
@@ -470,7 +402,7 @@ static bool run_test_section(llama_model &                    mdl,
         }
 
         if (got != expected) {
-            printf("  FAIL  %-50s %-10s expected %s, got %s\n", name.c_str(), llama_ftype_to_name(section.ftype), ggml_type_name(expected), ggml_type_name(got));
+            printf("  FAIL  %-50s %-10s expected %s, got %s\n", name, llama_ftype_to_name(section.ftype), ggml_type_name(expected), ggml_type_name(got));
             all_pass = false;
         }
     }
@@ -502,14 +434,17 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
         }
 
         const auto &                remote  = result.value();
-        auto                        model   = build_mock_model_from_remote(remote);
+        llama_model *               model   = build_mock_model_from_remote(remote);
         llama_model_quantize_params qparams = llama_model_quantize_default_params();
-        auto                        tensors = build_mock_tensors(remote, model->arch, qparams);
+        llama_quant *               qnt     = llama_quant_init(model, &qparams);
+        auto                        mt      = build_mock_tensors(qnt, remote);
 
         std::string                   snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
         std::vector<snapshot_section> sections;
         if (!parse_snapshot_file(snapshot_path, sections)) {
             printf("  SKIP  (could not read snapshot file: %s)\n\n", snapshot_path.c_str());
+            llama_quant_free(qnt);
+            llama_model_free(model);
             total_skip++;
             continue;
         }
@@ -518,7 +453,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
         int model_fail = 0;
 
         for (const auto & section : sections) {
-            bool pass = run_test_section(*model, tensors, section);
+            bool pass = run_test_section(qnt, mt, section);
             if (pass) {
                 model_pass++;
             } else {
@@ -527,7 +462,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
         }
 
         printf("  %s  %s: %d/%d ftype sections passed (%d tensors)\n", model_fail == 0 ? "PASS" : "FAIL", name.c_str(),
-               model_pass, model_pass + model_fail, (int) tensors.size());
+               model_pass, model_pass + model_fail, (int) mt.tensors.size());
         printf("\n");
 
         if (model_fail == 0) {
@@ -535,6 +470,9 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
         } else {
             total_fail++;
         }
+
+        llama_quant_free(qnt);
+        llama_model_free(model);
     }
 
     printf("%d/%d models passed", total_pass, total_pass + total_fail);
@@ -556,10 +494,10 @@ int main(int argc, char ** argv) {
     bool        generate     = false;
 
     for (int i = 1; i < argc; i++) {
-        if (strcmp(argv[i], "--snapshot-dir") == 0 && i + 1 < argc) {
-            snapshot_dir = argv[++i];
-        } else if (strcmp(argv[i], "--generate") == 0) {
+        if (strcmp(argv[i], "--generate") == 0) {
             generate = true;
+        } else if (strcmp(argv[i], "--snapshot-dir") == 0 && i + 1 < argc) {
+            snapshot_dir = argv[++i];
         }
     }