diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3d4785c1a3..cab4ecaeec 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -623,7 +623,7 @@ static void signal_handler(int) {
     bpw_stop.store(true, std::memory_order_relaxed);
 }
 
-// Returns tensor type overrides to meet a global bpw target
+// Returns tensor type overrides that meet a global bpw target
 static std::unordered_map target_bpw_type(
     llama_model_loader & ml,
     const llama_model & model,
@@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type(
         }
     } signal_guard;
 
+    // Error and bias projection per GGML_TYPE per tensor
     struct candidate_types {
         ggml_type type = GGML_TYPE_COUNT;
         float bpw = 0.0f;
@@ -659,6 +660,7 @@ static std::unordered_map target_bpw_type(
         double proj = 0.0;
     };
 
+    // Per-tensor quantization mix that satisfies a global bpw target
     struct tensor_info {
         const llama_model_loader::llama_tensor_weight * w = nullptr;
         std::vector candidate;
@@ -697,22 +699,33 @@ static std::unordered_map target_bpw_type(
     constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d;
     const char * func = __func__;
 
+    // Tensor size in bytes for a given type
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
         const size_t row_sz = ggml_row_size(typ, n_per_row);
         return (size_t)ggml_nrows(t) * row_sz;
     };
 
+    // Tensor bpw for a given type
     auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double {
         const size_t bytes = tensor_bytes(t, typ);
         return (double)bytes * 8.0 / (double)ggml_nelements(t);
     };
 
+    // Check if tensor is compatible with quantization type
     auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool {
         const int64_t blck = ggml_blck_size(typ);
         return blck <= 1 || (t->ne[0] % blck) == 0;
     };
 
+    // Get suitable fallback for type
+    auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
+        if (is_compatible(t, typ)) { return typ; }
+        const ggml_type fb = fallback_type(typ);
+        return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
+    };
+
+    // Check if tensor is an IQ type
     auto is_iq = [](const enum ggml_type t) {
         switch (t) {
             case GGML_TYPE_IQ1_S:
@@ -730,12 +743,7 @@ static std::unordered_map target_bpw_type(
         }
     };
 
-    auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
-        if (is_compatible(t, typ)) { return typ; }
-        const ggml_type fb = fallback_type(typ);
-        return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
-    };
-
+    // Check if tensor can be quantized
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
         if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors
         return is_quantizable(ggml_get_name(t), model.arch, params);
@@ -750,6 +758,7 @@ static std::unordered_map target_bpw_type(
         size_t n_elements = 0;
     };
 
+    // DJB2 hashing algorithm
    auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t {
         uint64_t h = 5381;
         for (size_t i = 0; i < n; ++i) {
@@ -758,6 +767,7 @@ static std::unordered_map target_bpw_type(
         return h ? h : arbitrary_magic;
     };
 
+    // Get model ID from metadata hash
     auto metadata_id = [&](const gguf_context * ctx) -> uint64_t {
         const size_t sz = gguf_get_meta_size(ctx);
         std::vector buf(sz);
@@ -794,6 +804,7 @@ static std::unordered_map target_bpw_type(
         }
     }
 
+    // Serializes vector to disk
     auto save_bpw_state = [&](const std::vector & all_vec) {
         const std::string tmp = checkpoint_file + ".tmp";
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
@@ -832,6 +843,7 @@ static std::unordered_map target_bpw_type(
         LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
     };
 
+    // Deserializes vector from disk
     auto load_bpw_state = [&]() -> std::unordered_map {
         std::unordered_map out;
         std::ifstream ifs(checkpoint_file, std::ios::binary);
@@ -890,6 +902,7 @@ static std::unordered_map target_bpw_type(
         return out;
     };
 
+    // Deletes checkpoint file unless --keep-bpw-state is set
     auto delete_bpw_state = [&] {
         std::ifstream ifs(checkpoint_file);
         if (ifs.good() && !params->keep_bpw_state) {
@@ -898,6 +911,7 @@ static std::unordered_map target_bpw_type(
         }
     };
 
+    // Check for user interrupt and save progress
     auto check_signal_handler = [&](const std::vector & all_vec) {
         if (bpw_stop.load(std::memory_order_relaxed)) {
             LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
@@ -1161,7 +1175,7 @@ static std::unordered_map target_bpw_type(
 
     const auto bpw_data = load_bpw_state();
 
-    // Parallelize tensor processing - courtesy of https://github.com/ddh0
+    // Parallelize tensor processing (courtesy of https://github.com/ddh0)
     auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
         std::vector> & thread_local_buffer,
         std::mutex & loader_mutex,
@@ -1555,6 +1569,7 @@ static std::unordered_map target_bpw_type(
     size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0);
     size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes;
 
+    // Emit the per-tensor type overrides
     auto emit_overrides = [&]() -> std::unordered_map {
         std::unordered_map overrides;
         LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
@@ -1592,7 +1607,7 @@ static std::unordered_map target_bpw_type(
         return important;
     };
 
-    // Lagrangian relaxation to minimise error subject to a bpw target constraint
+    // Lagrangian relaxation to minimize error subject to a bpw target constraint
     auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) {
         choice.resize(all.size());
         bytes = 0;
@@ -1636,7 +1651,7 @@ static std::unordered_map target_bpw_type(
 
     lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo);
 
-    // increase mu until we get under budget or hit a safety cap
+    // Increase mu until we get under budget or hit a safety cap
     {
         int expand = 0;
         size_t prev_bytes_hi = std::numeric_limits::max();
@@ -1741,7 +1756,7 @@ static std::unordered_map target_bpw_type(
         }
     }
 
-    delete_bpw_state(); // we're done, clear any checkpoint
+    delete_bpw_state();
 
     return emit_overrides();
 }
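
The tensor_bytes / tensor_bpw helpers reduce to row-size arithmetic over the quantization block: bytes = nrows * (ne[0] / block_elems) * block_bytes, and bpw = bytes * 8 / nelements. Below is a minimal standalone sketch of that arithmetic with no ggml dependency; the Q4_0 and Q8_0 block layouts are the standard ggml values (32 elements per block, 18 and 34 bytes per block respectively), while the tensor shape is purely illustrative.

#include <cstdint>
#include <cstdio>

// Per-type layout: how many elements share one quantization block,
// and how many bytes that block occupies on disk.
struct type_layout {
    const char * name;
    int64_t block_elems;
    int64_t block_bytes;
};

// Bytes for one row of n_per_row elements, assuming n_per_row % block_elems == 0
static int64_t row_size(const type_layout & t, int64_t n_per_row) {
    return n_per_row / t.block_elems * t.block_bytes;
}

int main() {
    const type_layout q4_0 = { "Q4_0", 32, 18 };
    const type_layout q8_0 = { "Q8_0", 32, 34 };

    const int64_t n_per_row = 4096; // illustrative tensor shape
    const int64_t n_rows    = 4096;

    for (const auto & t : { q4_0, q8_0 }) {
        const int64_t bytes = n_rows * row_size(t, n_per_row);
        const double  bpw   = (double) bytes * 8.0 / ((double) n_per_row * n_rows);
        printf("%s: %lld bytes, %.2f bpw\n", t.name, (long long) bytes, bpw); // 4.5 and 8.5 bpw
    }
    return 0;
}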
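The djb2_hash lambda (only the seed and the loop header fall inside the hunk) is, as the new comment says, the classic DJB2 string hash: start at 5381 and fold in each byte as h = h * 33 + byte, with the shift-add form standing in for the multiply. A standalone sketch of that update follows; the patch's remapping of a zero result to arbitrary_magic (return h ? h : arbitrary_magic) presumably keeps 0 free as a "no id" sentinel.

#include <cstdint>
#include <cstddef>

// Classic djb2: seed 5381, then h = h * 33 + byte for every input byte.
// ((h << 5) + h) is the usual multiply-by-33 without an explicit mul.
static uint64_t djb2(const uint8_t * data, size_t n) {
    uint64_t h = 5381;
    for (size_t i = 0; i < n; ++i) {
        h = ((h << 5) + h) + data[i];
    }
    return h;
}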
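save_bpw_state opens checkpoint_file + ".tmp" with trunc before the real file is touched, which is the usual write-then-swap pattern for crash-safe checkpoints. The finalizing rename is outside the hunk, so the sketch below is only an assumption about how such a .tmp file is typically promoted, not a claim about the patch; the path and payload are illustrative.

#include <cstdio>
#include <fstream>
#include <string>

// Write the payload to "<path>.tmp" first, then swap it into place, so an
// interrupt mid-write never leaves a truncated checkpoint at the final path.
static bool save_atomically(const std::string & path, const std::string & payload) {
    const std::string tmp = path + ".tmp";
    {
        std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
        if (!ofs) { return false; }
        ofs.write(payload.data(), (std::streamsize) payload.size());
        if (!ofs.good()) { return false; }
    }
    // On POSIX, rename() over an existing file is atomic; Windows needs extra
    // handling, omitted in this sketch.
    return std::rename(tmp.c_str(), path.c_str()) == 0;
}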
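The comments describe the budgeting core: Lagrangian relaxation, where for a fixed penalty mu every tensor independently picks the candidate minimizing err + mu * bytes, and mu is grown and then bisected until the total size lands under the byte budget (the "increase mu until we get under budget" step). A simplified, self-contained sketch of that scheme follows; the cand/tensor_cands structures and the pick/solve names are illustrative, not the patch's.

#include <cstdint>
#include <vector>

struct cand { uint64_t bytes; double err; };   // one quantization option for a tensor
using tensor_cands = std::vector<cand>;        // all options for one tensor

// For a fixed penalty mu, each tensor independently picks argmin(err + mu * bytes).
// Returns the chosen index per tensor plus the total bytes and total error.
static void pick(const std::vector<tensor_cands> & all, double mu,
                 std::vector<size_t> & choice, uint64_t & bytes, double & err) {
    choice.assign(all.size(), 0);
    bytes = 0; err = 0.0;
    for (size_t i = 0; i < all.size(); ++i) {
        size_t best = 0;
        double best_cost = all[i][0].err + mu * (double) all[i][0].bytes;
        for (size_t j = 1; j < all[i].size(); ++j) {
            const double cost = all[i][j].err + mu * (double) all[i][j].bytes;
            if (cost < best_cost) { best_cost = cost; best = j; }
        }
        choice[i] = best;
        bytes += all[i][best].bytes;
        err   += all[i][best].err;
    }
}

// Grow mu until the selection fits the budget, then bisect to tighten it.
// (A real implementation would also handle the case where no mu fits the budget.)
static std::vector<size_t> solve(const std::vector<tensor_cands> & all, uint64_t budget) {
    std::vector<size_t> choice;
    uint64_t bytes; double err;
    double lo = 0.0, hi = 1e-9;
    pick(all, hi, choice, bytes, err);
    for (int i = 0; i < 64 && bytes > budget; ++i) { hi *= 2.0; pick(all, hi, choice, bytes, err); }
    for (int i = 0; i < 64; ++i) {
        const double mid = 0.5 * (lo + hi);
        pick(all, mid, choice, bytes, err);
        if (bytes > budget) { lo = mid; } else { hi = mid; }
    }
    pick(all, hi, choice, bytes, err); // hi is the smallest tested mu that fits
    return choice;
}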