diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b3f10856d6..5c044eac37 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -949,6 +949,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> & dequantized_buffer,
         float tensor_bias,
         const float * slice_bias,
+        float h_norm,
         const wce_cache * ref_wce = nullptr,
         const mse_cache * ref_mse = nullptr
     ) -> quant_error
@@ -990,10 +991,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                         const float * v = has_vals ? values_sample + s * n_per_row : nullptr;
                         const float * a = activations_sample + s * n_per_row;
                         double denom = 0.0;
-                        for (int64_t j = 0; j < n_per_row; ++j) {
-                            const double w = v ? std::max(0.0f, v[j]) : 1.0;
-                            const double aj = a[j];
-                            denom += w * aj * aj;
+                        if (v) {
+                            for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
+                        } else {
+                            for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
                         }
 
                         local_bias_denom[s] = denom;
@@ -1009,9 +1010,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     for (int64_t r = 0; r < rs; ++r) {
                         const float * x = f32_sample.data() + off;
                         double sum = 0.0;
-                        for (int64_t j = 0; j < n_per_row; ++j) {
-                            double xx = x[j];
-                            sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
+                        if (v) {
+                            for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
+                        } else {
+                            for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
                         }
 
                         local_row_sq_norm.push_back(sum);
@@ -1061,15 +1063,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         // Compute Error Metrics: Entropy-Modulated Weighted Cosine Error (WCE) - Experimental
         if (do_wce) {
-            float h_norm = 1.0f;
-            if (statistics_data) {
-                const std::string name = ggml_get_name(t);
-                const std::string key = remap_imatrix(name, mapped);
-                if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
-                    h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
-                }
-            }
-
             double total_cos_error = 0.0;
             size_t off = 0;
             size_t sample_idx = 0;
@@ -1093,44 +1086,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     const bool calc_nx = !cached_norm_x;
 
                     // SIMD-friendly loops
-                    if (v) {
-                        if (calc_nx) {
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double w = std::max(0.0f, v[j]);
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                const double yw = yj * w;
-                                dot += xj * yw;
-                                ny += yj * yw;
-                                nx += xj * xj * w;
-                            }
-                        } else {
-                            nx = (* cached_norm_x)[sample_idx];
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double w = std::max(0.0f, v[j]);
-                                const double yj = wy[j];
-                                const double yw = yj * w;
-                                dot += (double) wx[j] * yw;
-                                ny += yj * yw;
-                            }
+                    if (calc_nx) {
+                        for (int64_t j = 0; j < n_per_row; ++j) {
+                            const double w = std::max(0.0f, v[j]);
+                            const double xj = wx[j];
+                            const double yj = wy[j];
+                            const double yw = yj * w;
+                            dot += xj * yw;
+                            ny += yj * yw;
+                            nx += xj * xj * w;
                         }
                     } else {
-                        if (calc_nx) {
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                dot += xj * yj;
-                                ny += yj * yj;
-                                nx += xj * xj;
-                            }
-                        } else {
-                            nx = (* cached_norm_x)[sample_idx];
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                dot += xj * yj;
-                                ny += yj * yj;
-                            }
+                        nx = (* cached_norm_x)[sample_idx];
+                        for (int64_t j = 0; j < n_per_row; ++j) {
+                            const double w = std::max(0.0f, v[j]);
+                            const double yj = wy[j];
+                            const double yw = yj * w;
+                            dot += (double) wx[j] * yw;
+                            ny += yj * yw;
                         }
                     }
 
@@ -1184,14 +1157,35 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 double w_err = 0.0;
                 double bias_num = 0.0;
 
-                for (int64_t j = 0; j < n_per_row; ++j) {
-                    const double w = val ? std::max(0.0f, val[j]) : 1.0;
-                    const double e = y[j] - x[j];
-                    w_err += w * e * e;
-                    if (act) { bias_num += w * e * act[j]; }
+                if (val && act) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double w = std::max(0.0f, val[j]);
+                        const double e = y[j] - x[j];
+                        const double we = w * e;
+                        w_err += we * e;
+                        bias_num += we * act[j];
+                    }
+                } else if (val) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double w = std::max(0.0f, val[j]);
+                        const double e = y[j] - x[j];
+                        w_err += w * e * e;
+                    }
+                } else if (act) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                         const double e = y[j] - x[j];
+                         w_err += e * e;
+                         bias_num += e * act[j];
+                    }
+                } else {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double e = y[j] - x[j];
+                        w_err += e * e;
+                    }
                 }
 
-                const double m_norm = w_err / ((* ptr_row_sq_norm)[row_idx] + EPSILON);
+                const double rsn = (* ptr_row_sq_norm)[row_idx];
+                const double m_norm = rsn > EPSILON ? w_err / rsn : 0.0;
                 slice_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : INFINITE);
 
                 if (act) {
@@ -1319,7 +1313,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const ggml_type src_type = tensor->type;
             const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
             const ggml_type_traits * traits = ggml_get_type_traits(src_type);
-            std::vector<float> row_buf(n_per_row);
 
             for (int64_t slice = 0; slice < ne2; ++slice) {
                 std::mt19937 rng(std::hash<std::string>{}(name) ^ HASH_MAGIC ^ slice);
@@ -1330,18 +1323,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 int64_t count = 0;
                 for (int64_t r = offset; r < nrows_total && count < limit; r += stride) {
                     const uint8_t * src = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
-                    if (src_type == GGML_TYPE_F32) {
-                        f32_sample.insert(f32_sample.end(), (const float*)src, (const float*)src + n_per_row);
-                    } else if (src_type == GGML_TYPE_F16 || src_type == GGML_TYPE_BF16) {
-                        if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, row_buf.data(), (int)n_per_row); }
-                        else { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, row_buf.data(), (int)n_per_row); }
-                        f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
-                    } else if (traits && traits->to_float) {
-                        traits->to_float(src, row_buf.data(), (int)n_per_row);
-                        f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
-                    } else {
-                        throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type)));
-                    }
+                    size_t cur_sz = f32_sample.size();
+                    f32_sample.resize(cur_sz + n_per_row);
+                    float * dst = f32_sample.data() + cur_sz;
+
+                    if (src_type == GGML_TYPE_F32) { std::memcpy(dst, src, n_per_row * sizeof(float)); }
+                    else if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, dst, (int)n_per_row); }
+                    else if (src_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, dst, (int)n_per_row); }
+                    else if (traits && traits->to_float) { traits->to_float(src, dst, (int)n_per_row); }
+                    else { throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type))); }
 
                     ++count;
                 }
@@ -1360,6 +1350,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         auto [val_ptr, val_sz] = get_side_data(values_data);
         auto [act_ptr, act_sz] = get_side_data(activations_data);
 
+        // Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
+        float h_norm = 1.0f;
+        if (valid_wce && statistics_data) {
+            const std::string key = remap_imatrix(name, mapped);
+            if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
+                h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
+            }
+        }
+
         std::vector<float> val_vec;
         std::vector<float> act_vec;
         auto prepare_broadcast = [&](const float* src, size_t sz, std::vector<float>& dst) {
@@ -1378,7 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         prepare_broadcast(val_ptr, val_sz, val_vec);
         prepare_broadcast(act_ptr, act_sz, act_vec);
 
-        // Precompute WCE reference stats (row_sq_norm) to avoid recalculation per candidate
+        // Precompute WCE reference stats
         wce_cache ref_wce;
         mse_cache ref_mse;
         size_t total_rows_sampled = 0;
@@ -1386,13 +1385,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (valid_wce && !val_vec.empty() && !act_vec.empty()) {
             ref_wce.row_sq_norm.reserve(total_rows_sampled);
-
             size_t off = 0;
             for (int64_t s = 0; s < ne2; ++s) {
                 const int64_t rs = rows_sample[s];
                 if (rs == 0) { continue; }
                 const float * v = val_vec.data() + s * n_per_row;
-
                 for (int64_t r = 0; r < rs; ++r) {
                     const float * wx = f32_sample.data() + off;
                     double norm_x = 0.0;
@@ -1405,43 +1402,45 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 }
             }
         } else {
-             // Precompute MSE reference stats (row_sq_norm and bias_denominator) to avoid recalculation per candidate
-             ref_mse.row_sq_norm.reserve(total_rows_sampled);
-             ref_mse.bias_denominator.assign(ne2, 0.0);
-             const bool has_acts = !act_vec.empty();
-             const bool has_vals = !val_vec.empty();
+            // Precompute MSE reference stats
+            ref_mse.row_sq_norm.reserve(total_rows_sampled);
+            ref_mse.bias_denominator.assign(ne2, 0.0);
+            const bool has_acts = !act_vec.empty();
+            const bool has_vals = !val_vec.empty();
 
-             // Bias Denominators
-             if (has_acts) {
-                 for (int64_t s = 0; s < ne2; ++s) {
-                     const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
-                     const float * a = act_vec.data() + s * n_per_row;
-                     double denom = 0.0;
-                     for (int64_t j = 0; j < n_per_row; ++j) {
-                         const double w = v ? std::max(0.0f, v[j]) : 1.0;
-                         const double aj = a[j];
-                         denom += w * aj * aj;
-                     }
-                     ref_mse.bias_denominator[s] = denom;
-                 }
-             }
+            if (has_acts) {
+                for (int64_t s = 0; s < ne2; ++s) {
+                    const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
+                    const float * a = act_vec.data() + s * n_per_row;
+                    double denom = 0.0;
+                    if (v) {
+                        for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
+                    } else {
+                        for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
+                    }
 
-             // Row Squared Norms
-             size_t off = 0;
-             for (int64_t s = 0; s < ne2; ++s) {
-                 const int64_t rs = rows_sample[s];
-                 const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
-                 for (int64_t r = 0; r < rs; ++r) {
-                     const float * x = f32_sample.data() + off;
-                     double sum = 0.0;
-                     for (int64_t j = 0; j < n_per_row; ++j) {
-                         double xx = x[j];
-                         sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
-                     }
-                     ref_mse.row_sq_norm.push_back(sum);
-                     off += (size_t)n_per_row;
-                 }
-             }
+                    ref_mse.bias_denominator[s] = denom;
+                }
+            }
+
+            size_t off = 0;
+            for (int64_t s = 0; s < ne2; ++s) {
+                const int64_t rs = rows_sample[s];
+                const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
+                for (int64_t r = 0; r < rs; ++r) {
+                    const float * x = f32_sample.data() + off;
+                    double sum = 0.0;
+                    if (v) {
+                        for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
+                    }
+                    else {
+                        for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
+                    }
+
+                    ref_mse.row_sq_norm.push_back(sum);
+                    off += (size_t)n_per_row;
+                }
+            }
         }
 
         // Build candidates
@@ -1461,7 +1460,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::sort(valid_types.begin(), valid_types.end());
         valid_types.erase(std::unique(valid_types.begin(), valid_types.end()), valid_types.end());
 
-        // Calculate bias lambda to adjust the trade-off between MSE and systematic bias
         float tensor_lambda = 0.0f;
         std::vector<float> slice_lambdas = estimate_lambda(val_vec.empty()?nullptr:val_vec.data(), act_vec.empty()?nullptr:act_vec.data(), n_per_row, ne2);
         if (!slice_lambdas.empty()) {
@@ -1492,6 +1490,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 dq_buf,
                 tensor_lambda,
                 slice_lambdas.data(),
+                h_norm,
                 ptr_ref_wce,
                 ptr_ref_mse
             );
@@ -1507,35 +1506,29 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             evaluations.push_back(candidate);
         }
 
-        // Select final quality metric (MSE or MSE + bias) if not using WCE
         type_choice ch;
         ch.w = tw;
         ch.n_elements = ggml_nelements(tensor);
         bool bias_needed = false;
         if (!valid_wce && !slice_lambdas.empty()) {
-            // Determine if bias correction is required
             double best_mse = INFINITE;
             double max_rel_bias = 0.0;
             for (const auto& c : evaluations) {
                 if (c.bytes == 0) { continue; }
                 best_mse = std::min(best_mse, c.mse);
-                // Check penalty term contribution (error - mse)
                 if (c.mse > EPSILON) { max_rel_bias = std::max(max_rel_bias, std::max(0.0, c.error - c.mse) / c.mse); }
             }
 
-            // If penalty/bias is significant (>= 50% of MSE), use combined error, else pure MSE
             bias_needed = max_rel_bias >= 0.5;
         }
 
         for (const auto & ev : evaluations) {
             if (ev.bytes == 0) { continue; }
             type_scores ts = ev;
-            // If using WCE, c.error is already set
             if (!valid_wce && !bias_needed) { ts.error = ts.mse; }
             ch.candidates.push_back(ts);
         }
 
-        // Fallback if empty
         if (ch.candidates.empty()) {
             type_scores fb;
             fb.type = tensor->type;
@@ -1544,15 +1537,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             ch.candidates.push_back(fb);
         }
 
-        // Convex hull & Pareto Front simplification
         auto simplify_pareto = [](std::vector<type_scores> & candidates) {
             std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) {
                 return a.bytes < b.bytes || (a.bytes == b.bytes && a.error < b.error);
             });
             candidates.erase(std::unique(candidates.begin(), candidates.end(),
-                 [](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());
+                [](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());
 
-            // Lower envelope
             std::vector<type_scores> hull;
             double min_err = INFINITE;
             for(const auto & c : candidates) {
@@ -1563,12 +1554,12 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
             candidates = std::move(hull);
 
-            // Convex hull
             if (candidates.size() < 3) { return; }
             std::vector<type_scores> convex;
             auto cross = [](const auto& a, const auto& b, const auto& c) {
                 return ((double)b.bytes - (double)a.bytes) * (c.error - a.error) - ((double)c.bytes - (double)a.bytes) * (b.error - a.error);
             };
+
             for (const auto & c : candidates) {
                 while (convex.size() >= 2 && cross(convex[convex.size()-2], convex.back(), c) <= EPSILON) { convex.pop_back(); }
                 convex.push_back(c);