diff --git a/include/llama.h b/include/llama.h
index 48a074df06..c3f79a117e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -399,7 +399,7 @@ extern "C" {
         int64_t target_size;                  // target file size in bytes
         bool save_state;                      // keep bpw state file
         void * state_file;                    // pointer to bpw state file
-        bool ignore_tensor_importance;        // allocate target bpw budget equitably across all tensors
+        float importance_pct;                 // identify up to pct% of tensors as important
         bool use_wce;                         // optimize for WCE instead of MSE
     } llama_model_quantize_params;
 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index ea6a59e381..cf24ccd144 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -587,6 +587,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 ) {
     bpw_stop.store(false, std::memory_order_relaxed);
 
+    // Index of each metric inside a tensor's statistics_data vector (importance_score returns 0 for vectors with fewer than 12 entries)
+    enum {
+        ENERGY   = 0,
+        MEAN     = 1,
+        ELEMENTS = 2,
+        STDDEV   = 3,
+        SKEWNESS = 4,
+        KURTOSIS = 5,
+        GAIN     = 6,
+        H_NORM   = 7,
+        L2_DIST  = 8,
+        COSSIM   = 9,
+        PCC      = 10,
+        COVAR    = 11
+    };
+
     // SIGINT/SIGTERM signal handlers
     struct signal_scope_guard {
         using handler_t = void (*)(int);
@@ -621,6 +637,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         float min_bpw = 0.0;
         float max_bpw = 0.0;
         size_t n_elements = 0;
+        bool important = false;
     };
 
     // Quantization types
@@ -901,7 +918,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> & dequantized_buffer,
         float tensor_bias,
         const float * slice_bias,
-        float h_norm,
         const wce_cache * ref_wce = nullptr,
         const mse_cache * ref_mse = nullptr
     ) -> quant_error
@@ -1078,8 +1094,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 total_cos_error += slice_sum / (double)rs * (double)nrows;
             }
 
-            const double penalty = 2.0 - std::clamp((double) h_norm, 0.0, 1.0);
-            qe.wce = total_cos_error * penalty;
+            qe.wce = total_cos_error;
             qe.error = qe.wce;
             return qe;
         }
@@ -1306,13 +1321,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         auto [act_ptr, act_sz] = get_side_data(activations_data);
 
         // Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
-        float h_norm = 1.0f;
-        if (valid_wce && statistics_data) {
-            if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && !it->second.empty()) {
-                h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
-            }
-        }
-
         std::vector<float> val_storage;
         std::vector<float> act_storage;
         const float * val_vec_ptr = nullptr;
@@ -1440,6 +1448,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             dq_buf.reserve(total_rows_sampled * n_per_row);
         }
 
+        // Kurtosis-Gain error scaling factor; stays 1.0 unless the tensor's statistics vector is long enough to hold the GAIN entry
+        float scaling_factor = 1.0f;
+        if (statistics_data) {
+            if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && it->second.size() > (size_t)GAIN) {
+                const auto & ts = it->second;
+                scaling_factor = 1.0f + std::log1p(std::max(0.0f, ts[KURTOSIS])) * std::max(1.0f, std::isnan(ts[GAIN]) ? 1.0f : ts[GAIN]);
+            }
+        }
+
         for (ggml_type vt : valid_types) {
             if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
             const wce_cache * ptr_ref_wce = valid_wce && !ref_wce.row_sq_norm.empty() ? & ref_wce : nullptr;
@@ -1455,8 +1472,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 q_buf,
                 dq_buf,
                 tensor_lambda,
-                slice_lambdas.data(),
-                h_norm,
+                slice_lambdas.empty() ? nullptr : slice_lambdas.data(),
                 ptr_ref_wce,
                 ptr_ref_mse
             );
@@ -1465,7 +1481,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             candidate.type = vt;
             candidate.bpw = (float)tensor_bpw(tensor, vt);
             candidate.bytes = tensor_bytes(tensor, vt);
-            candidate.error = qe.error;
+            candidate.error = qe.error * scaling_factor;
             candidate.mse = qe.mse;
             candidate.proj = qe.proj;
             candidate.wce = qe.wce;
@@ -1616,10 +1632,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     auto build_mix = [&]() -> std::unordered_map<std::string, ggml_type> {
         std::unordered_map<std::string, ggml_type> mix;
         LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
-        for (const auto & ti : all_tensors) {
-            LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n",
-                func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidates[ti.choice].type), ti.candidates[ti.choice].bpw, ti.candidates[ti.choice].error);
-            mix[ggml_get_name(ti.w->tensor)] = ti.candidates[ti.choice].type;
+        for (const auto & tn : all_tensors) {
+            LLAMA_LOG_INFO("\t%s: %45s %s\t%8s, \t%1.4f bpw,\terror: %.4f\n",
+                func, ggml_get_name(tn.w->tensor), tn.important ? "⬆︎" : "-", ggml_type_name(tn.candidates[tn.choice].type), tn.candidates[tn.choice].bpw,
+                tn.candidates[tn.choice].error);
+            mix[ggml_get_name(tn.w->tensor)] = tn.candidates[tn.choice].type;
         }
 
         return mix;
@@ -1634,23 +1651,62 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return build_mix();
     }
 
+    // Heuristic importance score from a tensor's statistics vector; 0 when the vector is incomplete or the score is NaN
+    auto importance_score = [](const std::vector<float> & tstats) -> float {
+        if (tstats.size() < 12) { return 0.0f; }
+        const float energy = std::log1pf(std::max(0.0f, tstats[ENERGY]));
+        const float range = 1.0f + std::max(0.0f, tstats[STDDEV]);
+        const float magnitude = std::isfinite(tstats[L2_DIST]) ? 1.0f + tstats[L2_DIST] : 1.0f;
+        const float alignment = std::isfinite(tstats[COSSIM]) ? 1.0f - tstats[COSSIM] : 1.0f;
+        const float concentration = 1.0f - std::clamp(tstats[H_NORM], 0.0f, 100.0f) / 100.0f + EPSILON;
+        const float score = energy * range * magnitude * alignment * concentration;
+        return std::isnan(score) ? 0.0f : score; // NaN is unordered and would corrupt the nth_element cutoff selection
+    };
+
+    // Score cutoff such that roughly pct% of the tensors in stats score at or above it; NaN when stats is empty or pct is invalid
+    auto threshold_score = [&](const std::unordered_map<std::string, std::vector<float>> & stats, const float pct) -> float {
+        // Negated range test so that a NaN pct is rejected too (converting a NaN-derived index to size_t is undefined)
+        if (stats.empty() || !(pct >= 0.0f && pct <= 100.0f)) { return std::numeric_limits<float>::quiet_NaN(); }
+
+        std::vector<float> val;
+        val.reserve(stats.size());
+        for (const auto & ts : stats) { val.push_back(importance_score(ts.second)); }
+
+        // Rank of the cutoff in ascending score order: pct = 0 selects the maximum score, pct = 100 the minimum
+        const size_t idx = std::min(val.size() - 1, (size_t)std::round((1.0f - pct / 100.0f) * (val.size() - 1)));
+        std::nth_element(val.begin(), val.begin() + idx, val.end());
+
+        return val[idx];
+    };
+
+    float cutoff = std::numeric_limits<float>::quiet_NaN();
+    if (statistics_data && !statistics_data->empty()) { cutoff = threshold_score(* statistics_data, params->importance_pct); }
+    LLAMA_LOG_INFO("%s: - importance score cutoff: %1.4f\n", func, cutoff);
+
     // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
     auto is_important = [&](const std::string & tensor_name) -> bool {
-        bool important = false;
-        if (params->ignore_tensor_importance) { return important; }
+        if (tensor_name == "output.weight") { return true; }
+        if (params->importance_pct == 0.0f) { return false; }
+        if (std::isfinite(cutoff)) {
+            if (auto it = statistics_data->find(remap_imatrix(tensor_name, mapped)); it != statistics_data->end() && !it->second.empty()) {
+                return importance_score(it->second) >= cutoff;
+            }
+        } else {
+            return tensor_name.find(".attn_output.weight") != std::string::npos ||
+                tensor_name.find(".attn_o.weight") != std::string::npos ||
+                tensor_name.find(".attn_v.weight") != std::string::npos ||
+                tensor_name.find(".ffn_down.weight") != std::string::npos ||
+                tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
+                tensor_name.find(".time_mix_output.weight") != std::string::npos ||
+                tensor_name.find(".time_mix_value.weight") != std::string::npos;
+        }
 
-        important = tensor_name == "output.weight" ||
-                        tensor_name.find(".attn_output.weight") != std::string::npos ||
-                        tensor_name.find(".attn_o.weight") != std::string::npos ||
-                        tensor_name.find(".attn_v.weight") != std::string::npos ||
-                        tensor_name.find(".ffn_down.weight") != std::string::npos ||
-                        tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
-                        tensor_name.find(".time_mix_output.weight") != std::string::npos ||
-                        tensor_name.find(".time_mix_value.weight") != std::string::npos;
-
-        return important;
+        return false;
     };
 
+    // Determine tensor importance
+    for (auto & tn : all_tensors) { tn.important = is_important(ggml_get_name(tn.w->tensor)); }
+
     // Minimize error subject to a size target constraint
     auto lagrangian_relaxation = [&](const double mu, std::vector<int> & choices, size_t & bytes, double & cost) {
         choices.resize(all_tensors.size());
@@ -1658,8 +1714,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         cost = 0.0;
         for (size_t i = 0; i < all_tensors.size(); ++i) {
             const auto & tn = all_tensors[i];
-            const bool imp = is_important(ggml_get_name(tn.w->tensor));
-            const double eff_mu = imp ? mu * 0.1 : mu; // important tensors get 10x lower penalty
+            const double eff_mu = tn.important ? mu / penalty : mu; // important tensors get a lower penalty
 
             int best = 0;
             double min = INFINITE;
@@ -1764,7 +1819,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             auto bytes = (double)(tn.candidates[next].bytes - tn.candidates[tn.choice].bytes);
             if (bytes > EPSILON) {
                 double ratio = err / bytes;
-                if (is_important(ggml_get_name(tn.w->tensor))) { ratio *= 5.0; } // important tensors get 5x boost
+                if (tn.important) { ratio *= penalty; } // important tensors get a higher priority
                 queue.push({i, next, ratio});
             }
         }
@@ -2051,10 +2106,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             if (params->statistics) {
                 LLAMA_LOG_INFO("%s: imatrix has statistics\n", __func__);
             }
-            if (params->ignore_tensor_importance) {
-                LLAMA_LOG_INFO("%s: distributing budget equitably across all tensors\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: assigning more budget to important tensors\n", __func__);
+            if (params->importance_pct != 0.0f) {
+                LLAMA_LOG_INFO("%s: marking up to %.2f%% of tensors as important\n", __func__, params->importance_pct);
             }
             if (params->use_wce) {
                 LLAMA_LOG_INFO("%s: using experimental Weighted Cosine Error (WCE) optimization\n", __func__);
@@ -2426,7 +2479,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.target_size                 =*/ -1,
         /*.save_state                  =*/ false,
         /*.state_file                  =*/ nullptr,
-        /*.ignore_tensor_importance    =*/ false,
+        /*.importance_pct              =*/ 0.0f,
         /*.use_wce                     =*/ false
     };
 
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 5f4a449210..c13af3892d 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -139,8 +139,8 @@ static void usage(const char * executable) {
     printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
     printf("  --target-size N[unit]: target a file size. N must be a positive number with an optional unit (b, kb, mb, gb, tb)\n");
     printf("      Advanced option to automatically select quantization types to achieve a target file size\n");
-    printf("  --ignore-tensor-importance: distribute bpw budget equitably across all tensors\n");
-    printf("      Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n");
+    printf("  --importance-pct N: mark up to N%% of tensors as important. N must be a positive number between 0.0 and 100.0\n");
+    printf("      Advanced option to select up to N%% of important tensors to keep at a higher precision. It may increase quality for some models\n");
     printf("  --save-state: save the bpw / file size computations to <model name>-<model hash>-mse.bpw_state\n");
     printf("  --state-file file_name: file name to use instead of default\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
@@ -557,6 +557,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }
 
+// Parse the --importance-pct value. Returns true and stores the result in importance_pct only when data is a
+// number within [0.0, 100.0]; otherwise prints a diagnostic, leaves importance_pct untouched and returns false.
+static bool parse_importance_pct(const char * data, float & importance_pct) {
+    if (!data) {
+        printf("\n%s: no tensor importance %% provided\n\n", __func__);
+        return false;
+    }
+    try {
+        const float pct = std::stof(data);
+        if (!(pct >= 0.0f && pct <= 100.0f)) { // negated so that NaN, for which every comparison is false, is rejected too
+            printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
+            return false;
+        }
+        importance_pct = pct;
+    } catch (const std::exception &) { // std::stof throws when no conversion is possible or the value is out of float range
+        printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
+        return false;
+    }
+    return true;
+}
+
 static bool parse_target_size(const char * data, int64_t & target_size) {
     if (!data) {
         printf("\n%s: no target file size provided\n\n", __func__);
@@ -633,6 +654,7 @@ int main(int argc, char ** argv) {
     std::vector<int> prune_layers;
     float target_bpw = -1.0f;
     int64_t target_size = -1;
+    float importance_pct = 0.0f;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -673,8 +695,10 @@ int main(int argc, char ** argv) {
             }
         } else if (strcmp(argv[arg_idx], "--use-wce") == 0) {
             params.use_wce = true;
-        } else if (strcmp(argv[arg_idx], "--ignore-tensor-importance") == 0) {
-            params.ignore_tensor_importance = true;
+        } else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
+            if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--save-state") == 0) {
             params.save_state = true;
         } else if (strcmp(argv[arg_idx], "--state-file") == 0) {
@@ -792,6 +816,9 @@ int main(int argc, char ** argv) {
     if (target_size != -1) {
         params.target_size = target_size;
     }
+    if (importance_pct != 0.0f) {
+        params.importance_pct = importance_pct;
+    }
 
     llama_backend_init();