Simplify tensor selection

2025-10-26 17:40:52 +00:00 · 2025-10-26 17:40:52 +00:00 · 5303212324
parent 8da14c0fc8
commit 5303212324
1 changed files with 11 additions and 88 deletions
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@ -11,11 +11,12 @@
 #include <csignal>
 #include <fstream>
 #include <mutex>
+#include <numeric>
+#include <optional>
 #include <random>
 #include <regex>
 #include <thread>
 #include <unordered_map>
-#include <optional>
 #include <unordered_set>

 // Quantization types. Changes to this struct must be replicated in quantize.cpp
@ -1151,7 +1152,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

    const auto bpw_data = load_bpw_state();

-    // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    // Parallelize tensor processing - courtesy of https://github.com/ddh0
    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
        std::vector<no_init<uint8_t>> & thread_local_buffer,
        std::mutex & loader_mutex,
@ -1569,93 +1570,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        return emit_overrides();
    }

-    auto tensor_depth = [&](const std::string & name) -> float {
-        static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
-        std::smatch match;
-
-        // Depth component: output, embeddings & early/late layers are important
-        if (name == "output.weight" || name == "token_embd.weight") {
-            return 1.0f;
-        }
-        if (std::regex_search(name, match, layer_pattern)) {
-            const int layer = std::stoi(match[1]);
-            const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
-            const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
-            return 0.01f + 0.9f * center_dist;
-        }
-
-        return 0.0f;
-    };
-
-    auto tensor_importance = [&](const std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float> {
-        std::unordered_map<std::string, float> scores;
-        for (const auto & t : all_tensors) {
-            const std::string name = ggml_get_name(t.w->tensor);
-            float total_score = 0.0f;
-            float depth_score = 0.0f;
-            float type_score = 0.0f;
-
-            // Type component: certain tensor types have more impact on model quality
-            const std::vector<std::pair<float, std::vector<const char*>>> tensor_scores = {
-                {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}},
-                {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}},
-                {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}},
-                {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}},
-                {0.2f, {"token_embd.weight"}}
-            };
-            if (name == "output.weight") {
-                type_score = 1.0f;
-            } else {
-                for (const auto& ts : tensor_scores) {
-                    const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) {
-                        return name.find(pattern) != std::string::npos;
-                    });
-                    if (found) {
-                        type_score = ts.first;
-                        break;
-                    }
-                }
-            }
-            if (type_score > 0.0f) {
-                depth_score = tensor_depth(name);
-            }
-
-            // Weighted combination
-            total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth
-            if (total_score != 0.0f) {
-                scores[name] = total_score;
-                LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score);
-            }
-        }
-
-        return scores;
-    };
-
-    auto select_tensors = [&](const std::vector<tensor_info> & all_vec) -> std::unordered_set<std::string> {
-        const auto scores = tensor_importance(all_vec);
-
-        // Sort by score
-        std::vector<std::pair<std::string, float>> sorted_scores(scores.begin(), scores.end());
-        std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; });
-
-        // Select top percentile
-        const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front
-
-        std::unordered_set<std::string> important;
-        for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {
-            important.insert(sorted_scores[i].first);
-            LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second);
-        }
-
-        const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size();
-        LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct);
-        return important;
-    };
-
-    const auto important_set = select_tensors(all);
-
+    // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
    auto is_important = [&](const std::string & tensor_name) -> bool {
-        return important_set.count(tensor_name) > 0;
+        const auto important = tensor_name == "output.weight" ||
+                                    tensor_name.find(".ffn_down.weight") != std::string::npos ||
+                                    tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
+                                    tensor_name.find(".attn_output.weight") != std::string::npos ||
+                                    tensor_name.find(".time_mix_output.weight") != std::string::npos ||
+                                    tensor_name.find(".attn_o.weight") != std::string::npos;
+        return important;
    };

    // Lagrangian relaxation to minimise error subject to a bpw target constraint