diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e5e27da509..5460669e7c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -945,7 +945,7 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - int sample_rows_per_expert = 512; + constexpr int sample_rows_per_expert = 384; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row);