diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 617c7d9473..04f4ff341a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -11,11 +11,12 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
-#include
 #include
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -1151,7 +1152,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     const auto bpw_data = load_bpw_state();
 
-    // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    // Parallelize tensor processing - courtesy of https://github.com/ddh0
     auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
                               std::vector<std::vector<float>> & thread_local_buffer,
                               std::mutex & loader_mutex,
@@ -1569,93 +1570,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
 
-    auto tensor_depth = [&](const std::string & name) -> float {
-        static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
-        std::smatch match;
-
-        // Depth component: output, embeddings & early/late layers are important
-        if (name == "output.weight" || name == "token_embd.weight") {
-            return 1.0f;
-        }
-        if (std::regex_search(name, match, layer_pattern)) {
-            const int layer = std::stoi(match[1]);
-            const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
-            const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
-            return 0.01f + 0.9f * center_dist;
-        }
-
-        return 0.0f;
-    };
-
-    auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map<std::string, float> {
-        std::unordered_map<std::string, float> scores;
-        for (const auto & t : all_tensors) {
-            const std::string name = ggml_get_name(t.w->tensor);
-            float total_score = 0.0f;
-            float depth_score = 0.0f;
-            float type_score = 0.0f;
-
-            // Type component: certain tensor types have more impact on model quality
-            const std::vector<std::pair<float, std::vector<const char*>>> tensor_scores = {
-                {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}},
-                {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}},
-                {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}},
-                {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}},
-                {0.2f, {"token_embd.weight"}}
-            };
-            if (name == "output.weight") {
-                type_score = 1.0f;
-            } else {
-                for (const auto& ts : tensor_scores) {
-                    const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) {
-                        return name.find(pattern) != std::string::npos;
-                    });
-                    if (found) {
-                        type_score = ts.first;
-                        break;
-                    }
-                }
-            }
-            if (type_score > 0.0f) {
-                depth_score = tensor_depth(name);
-            }
-
-            // Weighted combination
-            total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth
-            if (total_score != 0.0f) {
-                scores[name] = total_score;
-                LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", __func__, name.c_str(), depth_score, type_score, total_score);
-            }
-        }
-
-        return scores;
-    };
-
-    auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set<std::string> {
-        const auto scores = tensor_importance(all_vec);
-
-        // Sort by score
-        std::vector<std::pair<std::string, float>> sorted_scores(scores.begin(), scores.end());
-        std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; });
-
-        // Select top percentile
-        const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front
-
-        std::unordered_set<std::string> important;
-        for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {
-            important.insert(sorted_scores[i].first);
-            LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", __func__, sorted_scores[i].first.c_str(), sorted_scores[i].second);
-        }
-
-        const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size();
-        LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", __func__, important.size(), sorted_scores.size(), pct);
-        return important;
-    };
-
-    const auto important_set = select_tensors(all);
-
+    // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
     auto is_important = [&](const std::string & tensor_name) -> bool {
-        return important_set.count(tensor_name) > 0;
+        const auto important = tensor_name == "output.weight" ||
+                               tensor_name.find(".ffn_down.weight") != std::string::npos ||
+                               tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
+                               tensor_name.find(".attn_output.weight") != std::string::npos ||
+                               tensor_name.find(".time_mix_output.weight") != std::string::npos ||
+                               tensor_name.find(".attn_o.weight") != std::string::npos;
+        return important;
     };
 
     // Lagrangian relaxation to minimise error subject to a bpw target constraint
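For readers unfamiliar with the technique named in the final comment: Lagrangian relaxation turns the constrained problem (minimise total quantization error subject to an average bits-per-weight target) into an unconstrained one by pricing bits with a multiplier lambda. The sketch below is a minimal illustration under assumed data structures, not the PR's implementation; candidate, tensor_opts, pick_types, and the 0.5 penalty factor for important tensors are all hypothetical.

// Illustrative sketch only, not part of the patch above. Shows one way a
// Lagrangian relaxation over per-tensor quantization candidates can be
// structured; all names here are hypothetical.
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct candidate {
    double bpw;   // bits per weight of this quantization type
    double error; // estimated quantization error for this tensor
};

struct tensor_opts {
    std::string            name;
    uint64_t               n_elements;
    std::vector<candidate> cands;
    bool                   important; // important tensors get a reduced bit penalty
};

// For a fixed multiplier lambda, each tensor independently minimizes
// error + lambda * bits; bisecting on lambda drives the size-weighted
// average bpw toward the target.
static std::vector<size_t> pick_types(const std::vector<tensor_opts> & tensors, double target_bpw) {
    std::vector<size_t> choice(tensors.size(), 0);
    if (tensors.empty()) {
        return choice;
    }
    double lo = 0.0;
    double hi = 1e9;
    for (int iter = 0; iter < 64; ++iter) {
        const double lambda = 0.5 * (lo + hi);
        double bits    = 0.0;
        double weights = 0.0;
        for (size_t i = 0; i < tensors.size(); ++i) {
            const auto & t = tensors[i];
            // A lower penalty lets "important" tensors keep more bits
            const double penalty = t.important ? 0.5 * lambda : lambda;
            size_t best      = 0;
            double best_cost = 1e300;
            for (size_t j = 0; j < t.cands.size(); ++j) {
                const double cost = t.cands[j].error + penalty * t.cands[j].bpw * (double) t.n_elements;
                if (cost < best_cost) { best_cost = cost; best = j; }
            }
            choice[i] = best;
            bits    += t.cands[best].bpw * (double) t.n_elements;
            weights += (double) t.n_elements;
        }
        if (bits / weights > target_bpw) {
            lo = lambda; // over budget: penalize bits harder
        } else {
            hi = lambda; // under budget: allow more bits
        }
    }
    return choice;
}

Raising lambda makes bits more expensive, so each tensor drops to a cheaper type; the bisection converges on a multiplier whose induced allocation meets the bpw budget, and the reduced penalty biases the tensors matched by is_important toward higher-precision types.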