From 12e0524f3a24d4d5c8a81546fff83fee81e0d3e1 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 12 Oct 2025 15:12:15 +0100
Subject: [PATCH] Reduce compute time by parallelising tensor processing -
 courtesy of https://github.com/ddh0

---
 src/llama-quant.cpp | 187 +++++++++++++++++++++++---------------------
 1 file changed, 100 insertions(+), 87 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 07a88f0fd6..c607651b05 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <optional>
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
 struct tensor_quantization {
@@ -623,7 +624,6 @@ static void signal_handler(int) {
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
-    std::vector<no_init<uint8_t>> & buffer,
     const llama_model & model,
     const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
     const std::map & mapped,
@@ -659,6 +659,7 @@
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
@@ -1127,16 +1128,22 @@
     install_signal_handlers();
     auto bpw_data = load_bpw_state();
 
-    std::vector<tensor_info> all;
-    all.reserve(tensors.size());
-    for (const auto * tw : tensors) {
+
+    // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
+                              std::vector<no_init<uint8_t>> & thread_local_buffer,
+                              std::mutex & loader_mutex,
+                              std::mutex & log_mutex) -> std::optional<tensor_info>
+    {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
-        if (!can_quantize(tensor)) { continue; }
-        check_signal_handler(all);
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            return std::nullopt;
+        }
 
-        // If we already have fully evaluatedd this tensor then reuse it
-        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+        // Check for pre-computed results from a checkpoint file
+        auto it_saved = bpw_data.find(name);
+        if (it_saved != bpw_data.end()) {
             tensor_info info;
             info.w = tw;
             info.candidate = it_saved->second.candidate;
@@ -1144,17 +1151,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             info.min_bpw = it_saved->second.min_bpw;
             info.max_bpw = it_saved->second.max_bpw;
             info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
-            all.push_back(std::move(info));
-            continue;
+            return info;
+        }
+        {
+            std::lock_guard<std::mutex> lock(log_mutex);
+            LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor));
         }
-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
 
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
-            tensor->data = buffer.data();
+            if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = thread_local_buffer.data();
+        }
+        {
+            std::lock_guard<std::mutex> lock(loader_mutex);
+            ml.load_data_for(tensor);
         }
-
-        ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
         const int64_t n_per_row = tensor->ne[0];
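The loader is shared mutable state, so only the `ml.load_data_for()` call is serialised behind `loader_mutex`; the expensive dequantisation and error estimation run unlocked on thread-private buffers. A minimal standalone sketch of that pattern, with illustrative names rather than code from the patch:

    #include <mutex>

    std::mutex loader_mutex; // guards the shared, non-thread-safe loader

    template <typename Loader, typename Tensor>
    void load_then_process(Loader & ml, Tensor * tensor) {
        {
            // hold the lock only while touching shared loader state
            std::lock_guard<std::mutex> lock(loader_mutex);
            ml.load_data_for(tensor); // the call the patch guards
        }
        // dequantisation / error estimation proceed in parallel here,
        // operating only on this thread's private scratch buffers
    }

Keeping the critical section to the load alone is what lets the per-tensor workers scale; widening it to the whole body would serialise the work again.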
@@ -1170,7 +1181,7 @@
         const int64_t max_rows = 4096;
         int64_t total_rows = std::llround(slice_budget / std::max(1, n));
         total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
-        if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+        if (rows <= min_rows * 2) { total_rows = rows; }
 
         return total_rows;
     };
@@ -1191,17 +1202,16 @@
             return;
         }
         if (t == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+            ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
             return;
         }
         if (t == GGML_TYPE_BF16) {
-            ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+            ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
             return;
         }
-
         if (src_is_quant) {
             GGML_ASSERT(src_traits && src_traits->to_float);
-            src_traits->to_float(src, dst, (int) n_per_row);
+            src_traits->to_float(src, dst, (int)n_per_row);
             return;
         }
@@ -1266,6 +1276,7 @@
             return;
         }
 
+        std::lock_guard<std::mutex> lock(log_mutex);
         LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n",
             func, name.c_str(), src_sz, (size_t)n_per_row, want);
     };
@@ -1276,12 +1287,9 @@
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
-        info.n_elements = nelem;
-
-        // Prepare scratch buffers sized for the largest candidate row size
+        info.n_elements = ggml_nelements(tensor);
         size_t total_sampled_rows = f32_sample.size() / n_per_row;
 
         // Build list of candidate types first (compatible ones)
@@ -1295,7 +1303,8 @@
         for (size_t i = 0; i < base_sz; ++i) {
             ggml_type ts_type = base_arr[i];
             if (is_iq(ts_type) && !has_valid_imatrix) {
-                LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str());
+                std::lock_guard<std::mutex> lock(log_mutex);
+                LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
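The F16/BF16/quantized dispatch above follows ggml's public conversion helpers. As a self-contained reference sketch, assuming the current ggml.h API where `ggml_get_type_traits()` exposes a `to_float` callback (the `(int)` casts in the patch suggest this tree still passes the row length as `int`):

    #include <cstring>

    #include "ggml.h"

    // Dequantize one row of any ggml type into f32 (illustrative, not patch code)
    static void row_to_f32(ggml_type t, const void * src, float * dst, int64_t n_per_row) {
        if (t == GGML_TYPE_F32) {
            memcpy(dst, src, n_per_row * sizeof(float)); // already f32, plain copy
            return;
        }
        if (t == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, n_per_row);
            return;
        }
        if (t == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, n_per_row);
            return;
        }
        // quantized types: use the per-type traits table
        const ggml_type_traits * traits = ggml_get_type_traits(t);
        GGML_ASSERT(traits && traits->to_float);
        traits->to_float(src, dst, n_per_row);
    }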
@@ -1325,58 +1334,38 @@
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantized_buffer(f32_sample.size());
         const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
-        int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
-        std::atomic<size_t> cidx{0};
-        std::vector<std::thread> eval_workers;
-        eval_workers.reserve(n_eval_threads);
-        for (int ti = 0; ti < n_eval_threads; ++ti) {
-            eval_workers.emplace_back([&] {
-                // thread-local scratch
-                std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
-                std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
-                for (;;) {
-                    if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
-                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
-                    if (i >= compatible_candidates.size()) { break; }
+        for (size_t i = 0; i < compatible_candidates.size(); ++i) {
+            if (bpw_stop.load(std::memory_order_relaxed)) { break; }
 
-                    const ggml_type tensor_types = compatible_candidates[i];
-                    const auto bpw = (float)tensor_bpw(tensor, tensor_types);
-                    const size_t bytes = tensor_bytes(tensor, tensor_types);
-                    double mse = 0.0;
-                    double proj = 0.0;
-                    const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
-                }
-            });
+            const ggml_type tensor_types = compatible_candidates[i];
+            const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+            const size_t bytes = tensor_bytes(tensor, tensor_types);
+            double mse = 0.0;
+            double proj = 0.0;
+            const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+            eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
         }
-        for (auto &th : eval_workers) { th.join(); }
-
-        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
-        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
-            check_signal_handler(all);
-        }
+        if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
 
         // Check if biasing is needed
         bool bias_needed = false;
         if (!lambdas.empty()) {
             int min_mse = -1;
             int min_bias = -1;
-            {
-                double best_mse = std::numeric_limits<double>::infinity();
-                double best_err = std::numeric_limits<double>::infinity();
-                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
-                    const auto & c = eval_candidates[i];
-                    if (c.bytes == 0) { continue; }
-                    if (c.mse < best_mse) {
-                        best_mse = c.mse;
-                        min_mse = i;
-                    }
-                    if (c.error < best_err) {
-                        best_err = c.error;
-                        min_bias = i;
-                    }
+            double best_mse = std::numeric_limits<double>::infinity();
+            double best_err = std::numeric_limits<double>::infinity();
+            for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                const auto & c = eval_candidates[i];
+                if (c.bytes == 0) { continue; }
+                if (c.mse < best_mse) {
+                    best_mse = c.mse;
+                    min_mse = i;
+                }
+                if (c.error < best_err) {
+                    best_err = c.error;
+                    min_bias = i;
                 }
             }
@@ -1388,8 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 if (c.bytes == 0) { continue; }
                 const double mse = std::max(c.mse, epsilon);
                 const double bias_term = std::max(0.0, c.error - c.mse);
-                const double rel = bias_term / mse;
-                max_rel_bias = std::max(rel, max_rel_bias);
+                max_rel_bias = std::max(bias_term / mse, max_rel_bias);
             }
 
             bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
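The gate above reads as: engage the activation-aware bias only if, for some candidate, the non-MSE share of its error reaches half its MSE. For example, error 0.9 with MSE 0.5 gives a bias term of 0.4, i.e. 80% of the MSE, so biasing stays on. Restated as a standalone helper for clarity (hypothetical `candidate` struct; `epsilon` guard as in the patch):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct candidate { double error; double mse; size_t bytes; };

    // True if, for any evaluated candidate, the non-MSE part of the error
    // is at least 50% of the MSE itself (same threshold as the patch).
    static bool needs_bias(const std::vector<candidate> & cs, double epsilon) {
        double max_rel_bias = 0.0;
        for (const auto & c : cs) {
            if (c.bytes == 0) { continue; }                    // unevaluated slot
            const double mse       = std::max(c.mse, epsilon); // avoid divide-by-zero
            const double bias_term = std::max(0.0, c.error - c.mse);
            max_rel_bias = std::max(bias_term / mse, max_rel_bias);
        }
        return max_rel_bias >= 0.5;
    }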
@@ -1404,7 +1392,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements;
             info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
 
@@ -1416,26 +1404,18 @@
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
             return a.error < b.error;
         });
-        const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
+        candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             return a.bytes == b.bytes;
-        });
-        candidates.erase(last, candidates.end());
-
-        // Pareto by bytes -> error
+        }), candidates.end());
         std::vector<candidate_types> pareto;
         pareto.reserve(candidates.size());
         double best_err = infinity;
-        size_t last_b = std::numeric_limits<size_t>::max();
         for (const auto & c : candidates) {
-            if (c.bytes != last_b) {
-                last_b = c.bytes;
-                if (c.error < best_err) {
-                    best_err = c.error;
-                    pareto.push_back(c);
-                }
+            if (c.error < best_err) {
+                best_err = c.error;
+                pareto.push_back(c);
             }
         }
-
         candidates.swap(pareto);
 
         if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull
@@ -1470,10 +1450,43 @@
         info.choice = 0;
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
-        all.push_back(std::move(info));
-        check_signal_handler(all); // save after each tensor
+
+        return info;
+    };
+
+    std::vector<tensor_info> all; // this vector will be populated by the parallel workers
+    {
+        std::atomic<size_t> tensor_idx{0}; // shared work queue index for all threads
+        const size_t num_tensors_to_process = tensors.size();
+        std::mutex loader_mutex;
+        std::mutex log_mutex;
+        std::mutex results_mutex;
+        std::vector<std::thread> workers;
+        int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process));
+
+        for (int i = 0; i < num_threads_to_spawn; ++i) {
+            workers.emplace_back([&]() {
+                std::vector<no_init<uint8_t>> thread_local_buffer;
+                while (true) {
+                    const size_t current_idx = tensor_idx.fetch_add(1);
+                    if (current_idx >= num_tensors_to_process) { break; }
+                    const auto * tw = tensors[current_idx];
+                    if (!can_quantize(tw->tensor)) { continue; }
+                    // Execute the main processing logic for this tensor
+                    std::optional<tensor_info> result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex);
+                    if (result_info) {
+                        std::lock_guard<std::mutex> lock(results_mutex);
+                        all.push_back(std::move(*result_info));
+                    }
+                }
+            });
+        }
+
+        for (auto & w : workers) { w.join(); }
+    }
+
+    check_signal_handler(all);
+
     if (all.empty()) { return {}; }
 
     // Compute total elements across all tensors and bytes for non-quantizable tensors
@@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
             }
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
-            bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
+            bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {
             LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__);
         }
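Taken together, the scheduling this patch introduces is a plain work queue: an atomic cursor hands out tensor indices, each worker owns its scratch buffer, and only the shared results vector is locked. A generic standalone sketch of the same pattern (illustrative, not patch code):

    #include <algorithm>
    #include <atomic>
    #include <mutex>
    #include <optional>
    #include <thread>
    #include <utility>
    #include <vector>

    // Run `process` over `items` on up to `nthread` threads; `process` may
    // return std::nullopt to skip an item (like non-quantizable tensors above).
    template <typename Item, typename Result, typename Fn>
    std::vector<Result> parallel_map(const std::vector<Item> & items, int nthread, Fn process) {
        std::vector<Result> results;
        std::atomic<size_t> next{0};      // shared work-queue cursor
        std::mutex results_mutex;         // the only shared state that needs locking
        std::vector<std::thread> workers;
        const int n = std::max(1, std::min<int>(nthread, (int)items.size()));
        for (int t = 0; t < n; ++t) {
            workers.emplace_back([&] {
                while (true) {
                    const size_t i = next.fetch_add(1); // claim the next index
                    if (i >= items.size()) { break; }
                    std::optional<Result> r = process(items[i]);
                    if (r) {
                        std::lock_guard<std::mutex> lock(results_mutex);
                        results.push_back(std::move(*r));
                    }
                }
            });
        }
        for (auto & w : workers) { w.join(); }
        return results;
    }

One consequence visible in the diff: `all` is now filled in completion order rather than in file order, so downstream consumers must be order-insensitive or re-sort, and the checkpoint save (`check_signal_handler`) now runs once after the join instead of after every tensor.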