From b85a7c8c6c5365fe869da4bbdb07e06106579b21 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:47:12 -0400 Subject: [PATCH] Continue cleanup --- src/llama-ext.h | 10 +- src/llama-quant.cpp | 138 ++++++++++++++++------------ src/llama-quant.h | 35 ------- tests/test-quant-type-selection.cpp | 34 +++---- 4 files changed, 101 insertions(+), 116 deletions(-) diff --git a/src/llama-ext.h b/src/llama-ext.h index 7f632ab211..2ffb77934e 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -15,13 +15,13 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve( LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype); // Quantization state. -struct llama_quant; +struct quantize_state_impl; -LLAMA_API llama_quant * llama_quant_init( +LLAMA_API quantize_state_impl * llama_quant_init( const llama_model * model, const llama_model_quantize_params * params); -LLAMA_API void llama_quant_free(llama_quant * qnt); +LLAMA_API void llama_quant_free(quantize_state_impl * qs); // Descriptor for constructing a mock model for quantization testing. struct llama_quant_model_desc { @@ -42,14 +42,14 @@ LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_ // Returns true if this tensor should be quantized (based on name, dims, params). LLAMA_API bool llama_quant_tensor_allows_quantization( - const llama_quant * qnt, + const quantize_state_impl * qs, const ggml_tensor * tensor); // Compute quantization type assignments for a list of tensors. // All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter). // result_types: caller-allocated array of n_tensors elements, filled with assigned types. LLAMA_API void llama_quant_compute_types( - llama_quant * qnt, + quantize_state_impl * qs, llama_ftype ftype, ggml_tensor ** tensors, ggml_type * result_types, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8cb1298167..e98d1b91e5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -14,6 +14,13 @@ #include #include +// result of parsing --tensor-type option +// (changes to this struct must be reflected in tools/quantize/quantize.cpp) +struct tensor_type_option { + std::string name; + ggml_type type = GGML_TYPE_COUNT; +}; + // tensor categorization - used to avoid repeated string matching in quantization logic. // this is different from LLM_TN - we want broad categories, not specific tensor names per arch. enum class tensor_category { @@ -31,23 +38,6 @@ enum class tensor_category { OTHER }; -// per-tensor metadata, computed in the preliminary loop and used in the main loop -struct tensor_metadata { - std::string name; - ggml_type target_type; - tensor_category category; - std::string remapped_imatrix_name; - bool allows_quantization; - bool requires_imatrix; -}; - -// result of parsing --tensor-type option -// (changes to this struct must be reflected in tools/quantize/quantize.cpp) -struct tensor_type_option { - std::string name; - ggml_type type = GGML_TYPE_COUNT; -}; - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -168,25 +158,56 @@ static bool category_is_attn_v(tensor_category cat) { cat == tensor_category::ATTENTION_KV_B; } -struct compiled_tensor_type_patterns { - std::vector> patterns; -}; +// +// quantization state +// -llama_quant::llama_quant(const llama_model & model, const llama_model_quantize_params * params) - : model(model), params(params) -{ - if (params->tensor_types) { - const auto & tensor_types = *static_cast *>(params->tensor_types); - if (!tensor_types.empty()) { - tensor_type_patterns = std::make_unique(); +struct quantize_state_impl { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; + int i_attention_wv = 0; + int i_ffn_down = 0; + int i_ffn_gate = 0; + int i_ffn_up = 0; + + int n_fallback = 0; + + bool has_imatrix = false; + + // used to figure out if a model has tied embeddings (tok_embd shares weights with output) + bool has_tied_embeddings = true; // assume tied until we see output.weight + + // tensor type override patterns (compiled once, used twice) + std::vector> tensor_type_patterns; + + quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params): + model(model), params(params) + { + // compile regex patterns once - they are expensive + if (params->tensor_types) { + const auto & tensor_types = *static_cast *>(params->tensor_types); for (const auto & [tname, qtype] : tensor_types) { - tensor_type_patterns->patterns.emplace_back(std::regex(tname), qtype); + tensor_type_patterns.emplace_back(std::regex(tname), qtype); } } } -} +}; -llama_quant::~llama_quant() = default; + +// per-tensor metadata, computed in the preliminary loop and used in the main loop +struct tensor_metadata { + std::string name; + ggml_type target_type; + tensor_category category; + std::string remapped_imatrix_name; + bool allows_quantization; + bool requires_imatrix; +}; // // dequantization @@ -336,7 +357,7 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param // // incompatible tensor shapes are handled here - fallback to a compatible type -static ggml_type tensor_type_fallback(llama_quant & qs, const ggml_tensor * t, const ggml_type target_type) { +static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) { ggml_type return_type = target_type; const int64_t ncols = t->ne[0]; @@ -385,7 +406,7 @@ static ggml_type tensor_type_fallback(llama_quant & qs, const ggml_tensor * t, c } // internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch -static ggml_type llama_tensor_get_type_impl(llama_quant & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) { +static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -635,7 +656,7 @@ static ggml_type llama_tensor_get_type_impl(llama_quant & qs, ggml_type new_type } // outer wrapper: determine the ggml_type that this tensor should be quantized to -static ggml_type llama_tensor_get_type(llama_quant & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) { +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) { if (!tensor_allows_quantization(params, qs.model.arch, tensor)) { return tensor->type; } @@ -652,9 +673,9 @@ static ggml_type llama_tensor_get_type(llama_quant & qs, const llama_model_quant if (!params->pure && ggml_is_quantized(default_type)) { // if the user provided tensor types - use those bool manual = false; - if (qs.tensor_type_patterns) { + if (!qs.tensor_type_patterns.empty()) { const std::string tensor_name(tensor->name); - for (const auto & [pattern, qtype] : qs.tensor_type_patterns->patterns) { + for (const auto & [pattern, qtype] : qs.tensor_type_patterns) { if (std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n", @@ -810,7 +831,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) { } -static void init_quantize_state_counters(llama_quant & qs, std::vector & metadata) { +static void init_quantize_state_counters(quantize_state_impl & qs, std::vector & metadata) { for (auto & tm : metadata) { tensor_category cat = tensor_get_category(tm.name); tm.category = cat; @@ -869,7 +890,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: model.load_hparams(ml); model.load_stats (ml); - llama_quant qs(model, params); + quantize_state_impl qs(model, params); if (params->only_copy) { ftype = ml.ftype; @@ -1296,14 +1317,14 @@ uint32_t llama_model_quantize( // Helper functions for external tools exposed in llama-ext.h // -llama_quant * llama_quant_init( +quantize_state_impl * llama_quant_init( const llama_model * model, const llama_model_quantize_params * params) { - return new llama_quant(*model, params); + return new quantize_state_impl(*model, params); } -void llama_quant_free(llama_quant * qnt) { - delete qnt; +void llama_quant_free(quantize_state_impl * qs) { + delete qs; } llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) { @@ -1333,30 +1354,29 @@ llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * des } bool llama_quant_tensor_allows_quantization( - const llama_quant * qnt, + const quantize_state_impl * qs, const ggml_tensor * tensor) { - return tensor_allows_quantization(qnt->params, qnt->model.arch, tensor); + return tensor_allows_quantization(qs->params, qs->model.arch, tensor); } void llama_quant_compute_types( - llama_quant * qnt, + quantize_state_impl * qs, llama_ftype ftype, ggml_tensor ** tensors, ggml_type * result_types, size_t n_tensors) { // reset per-computation state - qnt->n_attention_wv = 0; - qnt->n_ffn_down = 0; - qnt->n_ffn_gate = 0; - qnt->n_ffn_up = 0; - qnt->i_attention_wv = 0; - qnt->i_ffn_down = 0; - qnt->i_ffn_gate = 0; - qnt->i_ffn_up = 0; - qnt->n_k_quantized = 0; - qnt->n_fallback = 0; - qnt->has_imatrix = false; - qnt->has_tied_embeddings = true; + qs->n_attention_wv = 0; + qs->n_ffn_down = 0; + qs->n_ffn_gate = 0; + qs->n_ffn_up = 0; + qs->i_attention_wv = 0; + qs->i_ffn_down = 0; + qs->i_ffn_gate = 0; + qs->i_ffn_up = 0; + qs->n_fallback = 0; + qs->has_imatrix = false; + qs->has_tied_embeddings = true; // build metadata from tensor names std::vector metadata(n_tensors); @@ -1365,16 +1385,16 @@ void llama_quant_compute_types( } // initialize counters and categories - init_quantize_state_counters(*qnt, metadata); + init_quantize_state_counters(*qs, metadata); // use a local copy of params with the requested ftype - llama_model_quantize_params local_params = *qnt->params; + llama_model_quantize_params local_params = *qs->params; local_params.ftype = ftype; ggml_type default_type = llama_ftype_get_default_type(ftype); // compute types for (size_t i = 0; i < n_tensors; i++) { - result_types[i] = llama_tensor_get_type(*qnt, &local_params, tensors[i], default_type, metadata[i]); + result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]); } } diff --git a/src/llama-quant.h b/src/llama-quant.h index 8aff106ddf..6f70f09bee 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,36 +1 @@ #pragma once - -#include "llama.h" - -#include - -struct llama_model; -struct compiled_tensor_type_patterns; - -struct llama_quant { - const llama_model & model; - const llama_model_quantize_params * params; - - int n_attention_wv = 0; - int n_ffn_down = 0; - int n_ffn_gate = 0; - int n_ffn_up = 0; - int i_attention_wv = 0; - int i_ffn_down = 0; - int i_ffn_gate = 0; - int i_ffn_up = 0; - - int n_k_quantized = 0; - int n_fallback = 0; - - bool has_imatrix = false; - - // used to figure out if a model has tied embeddings (tok_embd shares weights with output) - bool has_tied_embeddings = true; // assume tied until we see output.weight - - // tensor type override patterns (compiled once, used in llama_tensor_get_type) - std::unique_ptr tensor_type_patterns; - - llama_quant(const llama_model & model, const llama_model_quantize_params * params); - ~llama_quant(); -}; diff --git a/tests/test-quant-type-selection.cpp b/tests/test-quant-type-selection.cpp index aa0f14bf9f..ccecbed5c6 100644 --- a/tests/test-quant-type-selection.cpp +++ b/tests/test-quant-type-selection.cpp @@ -252,8 +252,8 @@ struct mock_tensors { std::vector tensors; }; -static mock_tensors build_mock_tensors(const llama_quant * qnt, - const gguf_remote_model & remote) { +static mock_tensors build_mock_tensors(const quantize_state_impl * qs, + const gguf_remote_model & remote) { const size_t ctx_size = remote.tensors.size() * ggml_tensor_overhead(); struct ggml_init_params params = { ctx_size, nullptr, true }; ggml_context_ptr ctx(ggml_init(params)); @@ -264,7 +264,7 @@ static mock_tensors build_mock_tensors(const llama_quant * qnt, ggml_tensor * gt = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, t.ne[0], t.ne[1], t.ne[2], t.ne[3]); ggml_set_name(gt, t.name.c_str()); - if (llama_quant_tensor_allows_quantization(qnt, gt)) { + if (llama_quant_tensor_allows_quantization(qs, gt)) { result.push_back(gt); } } @@ -279,7 +279,7 @@ static mock_tensors build_mock_tensors(const llama_quant * qnt, static std::string generate_snapshot(const std::string & name, const gguf_remote_model & remote, - llama_quant * qnt, + quantize_state_impl * qs, mock_tensors & mt) { std::ostringstream out; @@ -303,7 +303,7 @@ static std::string generate_snapshot(const std::string & name, } std::vector result_types(mt.tensors.size()); - llama_quant_compute_types(qnt, ft, mt.tensors.data(), result_types.data(), mt.tensors.size()); + llama_quant_compute_types(qs, ft, mt.tensors.data(), result_types.data(), mt.tensors.size()); out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n"; for (size_t j = 0; j < mt.tensors.size(); j++) { @@ -343,23 +343,23 @@ static int run_generate(const std::string & snapshot_dir) { const auto & remote = result.value(); llama_model * model = build_mock_model_from_remote(remote); llama_model_quantize_params qparams = llama_model_quantize_default_params(); - llama_quant * qnt = llama_quant_init(model, &qparams); - auto mt = build_mock_tensors(qnt, remote); + quantize_state_impl * qs = llama_quant_init(model, &qparams); + auto mt = build_mock_tensors(qs, remote); - std::string content = generate_snapshot(name, remote, qnt, mt); + std::string content = generate_snapshot(name, remote, qs, mt); std::string path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema"; std::ofstream f(path); if (!f.good()) { fprintf(stderr, "ERROR: could not write %s\n", path.c_str()); - llama_quant_free(qnt); + llama_quant_free(qs); llama_model_free(model); return 1; } f << content; n_written++; fprintf(stderr, " wrote %s\n", path.c_str()); - llama_quant_free(qnt); + llama_quant_free(qs); llama_model_free(model); } @@ -371,7 +371,7 @@ static int run_generate(const std::string & snapshot_dir) { // Test mode: compare against snapshot files // --------------------------------------------------------------------------- -static bool run_test_section(llama_quant * qnt, +static bool run_test_section(quantize_state_impl * qs, mock_tensors & mt, const snapshot_section & section) { // verify default_type matches what llama_ftype_get_default_type returns @@ -383,7 +383,7 @@ static bool run_test_section(llama_quant * qnt, } std::vector result_types(mt.tensors.size()); - llama_quant_compute_types(qnt, section.ftype, mt.tensors.data(), result_types.data(), mt.tensors.size()); + llama_quant_compute_types(qs, section.ftype, mt.tensors.data(), result_types.data(), mt.tensors.size()); std::map override_map(section.overrides.begin(), section.overrides.end()); @@ -436,14 +436,14 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0 const auto & remote = result.value(); llama_model * model = build_mock_model_from_remote(remote); llama_model_quantize_params qparams = llama_model_quantize_default_params(); - llama_quant * qnt = llama_quant_init(model, &qparams); - auto mt = build_mock_tensors(qnt, remote); + quantize_state_impl * qs = llama_quant_init(model, &qparams); + auto mt = build_mock_tensors(qs, remote); std::string snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema"; std::vector sections; if (!parse_snapshot_file(snapshot_path, sections)) { printf(" SKIP (could not read snapshot file: %s)\n\n", snapshot_path.c_str()); - llama_quant_free(qnt); + llama_quant_free(qs); llama_model_free(model); total_skip++; continue; @@ -453,7 +453,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0 int model_fail = 0; for (const auto & section : sections) { - bool pass = run_test_section(qnt, mt, section); + bool pass = run_test_section(qs, mt, section); if (pass) { model_pass++; } else { @@ -471,7 +471,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0 total_fail++; } - llama_quant_free(qnt); + llama_quant_free(qs); llama_model_free(model); }