From 51fee3e7ef73cc35ad76da07805597d5f2b9c672 Mon Sep 17 00:00:00 2001
From: Salvatore Rossitto <srossitto79@gmail.com>
Date: Wed, 18 Mar 2026 08:05:05 +0100
Subject: [PATCH] removed moe reap code merged by mistake

---
 tools/expert-profile/CMakeLists.txt        |   8 -
 tools/expert-profile/expert-profile.cpp    | 506 ---------------------
 tools/moe-pruning/README.md                |  97 ----
 tools/moe-pruning/analyze_stats.py         | 284 ------------
 tools/moe-pruning/build_expert_profile.sh  |  42 --
 tools/moe-pruning/extract_ppl.py           |  41 --
 tools/moe-pruning/gguf_prune.py            | 260 -----------
 tools/moe-pruning/requirements.txt         |   1 -
 tools/moe-pruning/sample_calibration.jsonl |   8 -
 9 files changed, 1247 deletions(-)
 delete mode 100644 tools/expert-profile/CMakeLists.txt
 delete mode 100644 tools/expert-profile/expert-profile.cpp
 delete mode 100644 tools/moe-pruning/README.md
 delete mode 100644 tools/moe-pruning/analyze_stats.py
 delete mode 100644 tools/moe-pruning/build_expert_profile.sh
 delete mode 100644 tools/moe-pruning/extract_ppl.py
 delete mode 100644 tools/moe-pruning/gguf_prune.py
 delete mode 100644 tools/moe-pruning/requirements.txt
 delete mode 100644 tools/moe-pruning/sample_calibration.jsonl
diff --git a/tools/expert-profile/CMakeLists.txt b/tools/expert-profile/CMakeLists.txt
deleted file mode 100644
index 859bd77a53..0000000000
--- a/tools/expert-profile/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(TARGET llama-expert-profile)
-add_executable(${TARGET} expert-profile.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()
diff --git a/tools/expert-profile/expert-profile.cpp b/tools/expert-profile/expert-profile.cpp
deleted file mode 100644
index de381ff1f1..0000000000
--- a/tools/expert-profile/expert-profile.cpp
+++ /dev/null
@@ -1,506 +0,0 @@
-/**
- * expert-profile: NemotronH MoE expert activation profiler (REAP implementation)
- *
- * Implements the REAP (Router-weighted Expert Activation Pruning) saliency criterion:
- *
- *   REAP(j) = mean over tokens routed to j of:  gate_weight(j,t) * ||expert_output(j,t)||_2
- *
- * where expert_output is ffn_moe_down (the FFN output BEFORE gate weighting),
- * and gate_weight is ffn_moe_weights (post-softmax routing probability).
- *
- * Intercepts three tensors per MoE layer via ggml eval callback:
- *   ffn_moe_topk-{il}    [n_expert_used, n_tokens] I32  — which experts were selected
- *   ffn_moe_weights-{il} [1, n_expert_used, n_tokens] F32 — gate weights (softmax probs)
- *   ffn_moe_down-{il}    [n_embd, n_expert_used, n_tokens] F32 — expert outputs (pre-weighting)
- *
- * Reference: "REAP: Router-weighted Expert Activation Pruning" (arXiv:2510.13999)
- *   score = mean_{x in X_j}[ g_j(x) * ||f_j(x)||_2 ]  (Equation 9)
- *
- * Usage:
- *   llama-expert-profile \
- *     -m model.gguf --jsonl training-data.jsonl --output expert_stats.json \
- *     [--n-experts 128] [--ctx-size 16384] [-ngl 32] [-t 24] [--save-every 1]
- */
-
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-#include "ggml-backend.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <mutex>
-#include <string>
-#include <vector>
-
-// ─── Per-layer stats ──────────────────────────────────────────────────────────
-
-struct LayerStats {
-    int64_t n_experts    = 0;
-    int64_t total_tokens = 0;  // tokens processed through this layer
-
-    // Frequency / weighted-frequency (kept for reference/comparison)
-    std::vector<int64_t> activation_counts;   // [n_experts] — how many tokens routed here
-    std::vector<double>  weighted_freq_sum;   // [n_experts] — sum of gate weights
-
-    // REAP: running sum and count for computing mean(gate_weight * ||expert_out||_2)
-    std::vector<double>  reap_sum;            // [n_experts] — sum of g_j(t)*||f_j(t)||_2
-    std::vector<double>  ean_sum;             // [n_experts] — sum of ||f_j(t)||_2 (EAN, no gate)
-
-    void init(int64_t n) {
-        n_experts = n;
-        activation_counts.assign(n, 0);
-        weighted_freq_sum.assign(n, 0.0);
-        reap_sum.assign(n, 0.0);
-        ean_sum.assign(n, 0.0);
-    }
-
-    // Called once we have all three tensors for a batch.
-    // expert_ids:  [n_expert_used * n_tokens]  I32  — flat, column-major: [k + t*n_expert_used]
-    // gate_weights:[n_expert_used * n_tokens]  F32  — same layout
-    // expert_outs: [n_embd * n_expert_used * n_tokens] F32 — layout: [e + k*n_embd + t*n_embd*n_expert_used]
-    //              i.e. for token t, expert-slot k: out vector starts at t*n_embd*n_expert_used + k*n_embd
-    void add_batch(const int32_t * expert_ids,
-                   const float   * gate_weights,
-                   const float   * expert_outs,
-                   int64_t         n_expert_used,
-                   int64_t         n_tok,
-                   int64_t         n_embd) {
-        total_tokens += n_tok;
-        for (int64_t t = 0; t < n_tok; ++t) {
-            for (int64_t k = 0; k < n_expert_used; ++k) {
-                const int64_t flat = k + t * n_expert_used;
-                const int32_t eid  = expert_ids[flat];
-                if (eid < 0 || eid >= n_experts) continue;
-
-                const float gw = gate_weights[flat];
-
-                // L2 norm of expert output vector for this (token, expert-slot)
-                const float * vec = expert_outs + t * n_embd * n_expert_used + k * n_embd;
-                double norm2 = 0.0;
-                for (int64_t d = 0; d < n_embd; ++d) {
-                    norm2 += (double)vec[d] * (double)vec[d];
-                }
-                const double norm = std::sqrt(norm2);
-
-                activation_counts [eid] += 1;
-                weighted_freq_sum [eid] += gw;
-                reap_sum          [eid] += gw * norm;   // REAP numerator
-                ean_sum           [eid] += norm;         // EAN numerator
-            }
-        }
-    }
-};
-
-// ─── Collector ────────────────────────────────────────────────────────────────
-
-struct ExpertCollector {
-    int64_t n_experts = 128;
-
-    std::map<int, LayerStats> layer_stats;
-    std::mutex                mtx;
-
-    // We need all three tensors before we can compute REAP.
-    // They arrive in order: topk → weights → down (per the graph build order).
-    // Store pending topk+weights until down arrives.
-    struct PendingBatch {
-        int64_t              n_expert_used = 0;
-        int64_t              n_tokens      = 0;
-        std::vector<int32_t> expert_ids;    // [n_expert_used * n_tokens]
-        std::vector<float>   gate_weights;  // [n_expert_used * n_tokens]
-        bool                 has_topk    = false;
-        bool                 has_weights = false;
-    };
-    std::map<int, PendingBatch> pending; // layer_idx → pending
-
-    // Strip device prefix/suffix: "CUDA0#ffn_moe_down-5#0" → "ffn_moe_down-5"
-    static std::string clean_name(const char * raw) {
-        const char * p = strchr(raw, '#');
-        if (p) {
-            ++p;
-            const char * q = strchr(p, '#');
-            return q ? std::string(p, q - p) : std::string(p);
-        }
-        return raw;
-    }
-
-    bool wants(struct ggml_tensor * t) {
-        if (!t->name[0]) return false;
-        const std::string n = clean_name(t->name);
-        return (n.compare(0, 13, "ffn_moe_topk-")    == 0 ||
-                n.compare(0, 16, "ffn_moe_weights-") == 0 ||
-                n.compare(0, 13, "ffn_moe_down-")    == 0);
-    }
-
-    bool on_tensor(struct ggml_tensor * t) {
-        const std::string name = clean_name(t->name);
-
-        // Identify tensor type and layer
-        int  il         = -1;
-        bool is_topk    = false;
-        bool is_weights = false;
-        bool is_down    = false;
-
-        if      (name.compare(0, 13, "ffn_moe_topk-")    == 0) { il = atoi(name.c_str() + 13); is_topk    = true; }
-        else if (name.compare(0, 16, "ffn_moe_weights-") == 0) { il = atoi(name.c_str() + 16); is_weights = true; }
-        else if (name.compare(0, 13, "ffn_moe_down-")    == 0) { il = atoi(name.c_str() + 13); is_down    = true; }
-        else return true;
-
-        if (il < 0) return true;
-
-        // Copy tensor data from (possibly GPU) buffer to host
-        const size_t nbytes = ggml_nbytes(t);
-        std::vector<char> buf(nbytes);
-        ggml_backend_tensor_get(t, buf.data(), 0, nbytes);
-
-        std::lock_guard<std::mutex> lk(mtx);
-        PendingBatch & pb = pending[il];
-
-        if (is_topk) {
-            // [n_expert_used, n_tokens] I32
-            pb.n_expert_used = t->ne[0];
-            pb.n_tokens      = t->ne[1];
-            pb.expert_ids.resize(pb.n_expert_used * pb.n_tokens);
-            memcpy(pb.expert_ids.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(int32_t));
-            pb.has_topk    = true;
-            pb.has_weights = false; // reset in case of re-use
-
-        } else if (is_weights) {
-            // [1, n_expert_used, n_tokens] F32 — flat layout same as topk
-            if (!pb.has_topk) return true; // shouldn't happen
-            pb.gate_weights.resize(pb.n_expert_used * pb.n_tokens);
-            memcpy(pb.gate_weights.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(float));
-            pb.has_weights = true;
-
-        } else if (is_down) {
-            // [n_embd, n_expert_used, n_tokens] F32
-            if (!pb.has_topk || !pb.has_weights) return true;
-
-            const int64_t n_embd        = t->ne[0];
-            const int64_t n_expert_used = t->ne[1];
-            const int64_t n_tokens      = t->ne[2];
-
-            // Sanity check
-            if (n_expert_used != pb.n_expert_used || n_tokens != pb.n_tokens) {
-                LOG_ERR("expert-profile: dimension mismatch at layer %d\n", il);
-                pending.erase(il);
-                return true;
-            }
-
-            // Ensure layer stats initialised
-            auto & ls = layer_stats[il];
-            if (ls.n_experts == 0) ls.init(n_experts);
-
-            const float * expert_outs = reinterpret_cast<const float *>(buf.data());
-            ls.add_batch(pb.expert_ids.data(), pb.gate_weights.data(),
-                         expert_outs, n_expert_used, n_tokens, n_embd);
-
-            // Done with this batch for this layer
-            pending.erase(il);
-        }
-
-        return true;
-    }
-};
-
-// ─── Global collector + C callback ───────────────────────────────────────────
-
-static ExpertCollector g_collector;
-
-static bool expert_eval_callback(struct ggml_tensor * t, bool ask, void * /*user_data*/) {
-    if (ask) return g_collector.wants(t);
-    return g_collector.on_tensor(t);
-}
-
-// ─── JSON output ──────────────────────────────────────────────────────────────
-
-static void save_stats(const std::string & path) {
-    std::ofstream f(path);
-    if (!f) {
-        LOG_ERR("expert-profile: failed to open output file '%s'\n", path.c_str());
-        return;
-    }
-
-    f << "{\n";
-    bool first_layer = true;
-    for (auto & [il, ls] : g_collector.layer_stats) {
-        if (!first_layer) f << ",\n";
-        first_layer = false;
-
-        f << "  \"" << il << "\": {\n";
-        f << "    \"total_tokens\": " << ls.total_tokens << ",\n";
-
-        // activation_counts
-        f << "    \"activation_counts\": [";
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (i) f << ", ";
-            f << ls.activation_counts[i];
-        }
-        f << "],\n";
-
-        // activation_frequency
-        f << "    \"activation_frequency\": [";
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (i) f << ", ";
-            f << ((ls.total_tokens > 0) ? (double)ls.activation_counts[i] / ls.total_tokens : 0.0);
-        }
-        f << "],\n";
-
-        // avg_gate_weight  (weighted_freq_sum / activation_counts)
-        f << "    \"avg_gate_weight\": [";
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (i) f << ", ";
-            f << ((ls.activation_counts[i] > 0) ? ls.weighted_freq_sum[i] / ls.activation_counts[i] : 0.0);
-        }
-        f << "],\n";
-
-        // ean_mean  = ean_sum / activation_counts  (EAN criterion, no gate weight)
-        f << "    \"ean_mean\": [";
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (i) f << ", ";
-            f << ((ls.activation_counts[i] > 0) ? ls.ean_sum[i] / ls.activation_counts[i] : 0.0);
-        }
-        f << "],\n";
-
-        // reap  = reap_sum / activation_counts  (REAP criterion, Eq.9)
-        f << "    \"reap\": [";
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (i) f << ", ";
-            f << ((ls.activation_counts[i] > 0) ? ls.reap_sum[i] / ls.activation_counts[i] : 0.0);
-        }
-        f << "],\n";
-
-        // never_activated
-        int64_t never = 0;
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            if (ls.activation_counts[i] == 0) ++never;
-        }
-        f << "    \"never_activated\": " << never << "\n";
-        f << "  }";
-    }
-    f << "\n}\n";
-
-    LOG_INF("expert-profile: stats saved to '%s'  (%zu MoE layers)\n",
-            path.c_str(), g_collector.layer_stats.size());
-}
-
-// ─── JSONL input ──────────────────────────────────────────────────────────────
-
-struct JsonPair { std::string prompt, response; };
-
-static bool json_get_string(const std::string & line, const std::string & key, std::string & out) {
-    std::string search = "\"" + key + "\"";
-    size_t kpos = line.find(search);
-    if (kpos == std::string::npos) return false;
-    size_t colon = line.find(':', kpos + search.size());
-    if (colon == std::string::npos) return false;
-    size_t q1 = line.find('"', colon + 1);
-    if (q1 == std::string::npos) return false;
-    out.clear();
-    for (size_t i = q1 + 1; i < line.size(); ++i) {
-        if (line[i] == '\\' && i + 1 < line.size()) {
-            ++i;
-            switch (line[i]) {
-                case '"':  out += '"';  break;
-                case '\\': out += '\\'; break;
-                case 'n':  out += '\n'; break;
-                case 'r':  out += '\r'; break;
-                case 't':  out += '\t'; break;
-                default:   out += line[i]; break;
-            }
-        } else if (line[i] == '"') {
-            return true;
-        } else {
-            out += line[i];
-        }
-    }
-    return false;
-}
-
-static std::vector<JsonPair> load_jsonl(const std::string & path) {
-    std::vector<JsonPair> pairs;
-    std::ifstream f(path);
-    if (!f) { LOG_ERR("expert-profile: cannot open JSONL file '%s'\n", path.c_str()); return pairs; }
-    std::string line;
-    while (std::getline(f, line)) {
-        if (line.empty()) continue;
-        JsonPair p;
-        json_get_string(line, "prompt",   p.prompt);
-        json_get_string(line, "response", p.response);
-        if (!p.prompt.empty() || !p.response.empty()) pairs.push_back(std::move(p));
-    }
-    return pairs;
-}
-
-// ─── Inference loop ───────────────────────────────────────────────────────────
-
-static void run_inference(llama_context * ctx,
-                          const llama_model * model,
-                          const std::vector<JsonPair> & pairs,
-                          int max_tokens,
-                          const std::string & output_path,
-                          int save_every) {
-    const llama_vocab * vocab  = llama_model_get_vocab(model);
-    const bool          add_bos = llama_vocab_get_add_bos(vocab);
-
-    llama_batch batch = llama_batch_init(max_tokens, 0, 1);
-
-    for (size_t pi = 0; pi < pairs.size(); ++pi) {
-        const std::string text = pairs[pi].prompt + "\n" + pairs[pi].response;
-
-        std::vector<llama_token> tokens = common_tokenize(ctx, text, add_bos, true);
-        if ((int)tokens.size() > max_tokens) tokens.resize(max_tokens);
-        if (tokens.empty()) continue;
-
-        LOG_INF("  [%zu/%zu] %zu tokens\n", pi + 1, pairs.size(), tokens.size());
-
-        llama_memory_clear(llama_get_memory(ctx), true);
-
-        common_batch_clear(batch);
-        for (int i = 0; i < (int)tokens.size(); ++i) {
-            common_batch_add(batch, tokens[i], i, {0}, false);
-        }
-        batch.logits[batch.n_tokens - 1] = true;
-
-        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("  [%zu/%zu] llama_decode failed — skipping\n", pi + 1, pairs.size());
-        }
-
-        if (save_every > 0 && (pi + 1) % save_every == 0) {
-            save_stats(output_path);
-        }
-    }
-
-    llama_batch_free(batch);
-}
-
-// ─── CLI ──────────────────────────────────────────────────────────────────────
-
-int main(int argc, char ** argv) {
-    std::string model_path;
-    std::string jsonl_path;
-    std::string output_path  = "expert_stats.json";
-    int         n_experts    = 128;
-    int         ctx_size     = 2048;
-    int         n_gpu_layers = 99;
-    int         n_threads    = 4;
-    int         save_every   = 100;
-    enum ggml_type kv_type_k = GGML_TYPE_F16;
-    enum ggml_type kv_type_v = GGML_TYPE_F16;
-
-    auto parse_ggml_type = [](const char * s) -> enum ggml_type {
-        if (strcmp(s, "f32")  == 0) return GGML_TYPE_F32;
-        if (strcmp(s, "f16")  == 0) return GGML_TYPE_F16;
-        if (strcmp(s, "q8_0") == 0) return GGML_TYPE_Q8_0;
-        if (strcmp(s, "q4_0") == 0) return GGML_TYPE_Q4_0;
-        fprintf(stderr, "Unknown KV type '%s', using f16\n", s); return GGML_TYPE_F16;
-    };
-
-    for (int i = 1; i < argc; ++i) {
-        std::string a(argv[i]);
-        auto next = [&]() -> const char * {
-            if (i + 1 >= argc) { fprintf(stderr, "Missing argument for %s\n", argv[i]); exit(1); }
-            return argv[++i];
-        };
-        if      (a == "-m" || a == "--model")           model_path  = next();
-        else if (a == "--jsonl")                         jsonl_path  = next();
-        else if (a == "--output")                        output_path = next();
-        else if (a == "--n-experts")                     n_experts    = atoi(next());
-        else if (a == "--ctx-size" || a == "-c")         ctx_size     = atoi(next());
-        else if (a == "-ngl" || a == "--n-gpu-layers")   n_gpu_layers = atoi(next());
-        else if (a == "-t" || a == "--threads")          n_threads    = atoi(next());
-        else if (a == "--type-k")                        kv_type_k   = parse_ggml_type(next());
-        else if (a == "--type-v")                        kv_type_v   = parse_ggml_type(next());
-        else if (a == "--save-every")                    save_every  = atoi(next());
-        else if (a == "-h" || a == "--help") {
-            fprintf(stderr,
-                "\nUsage: %s -m model.gguf --jsonl data.jsonl [options]\n"
-                "  --output PATH       Output JSON (default: expert_stats.json)\n"
-                "  --n-experts N       Experts per layer (default: 128)\n"
-                "  --ctx-size N        Context length (default: 2048)\n"
-                "  -ngl N              GPU layers (default: 99)\n"
-                "  -t N                CPU threads (default: 4)\n"
-                "  --type-k/v TYPE     KV cache type: f32/f16/q8_0/q4_0 (default: f16)\n"
-                "  --save-every N      Checkpoint every N samples (default: 100)\n\n", argv[0]);
-            return 0;
-        } else {
-            fprintf(stderr, "Unknown argument: %s\n", a.c_str()); return 1;
-        }
-    }
-
-    if (model_path.empty()) { fprintf(stderr, "Error: -m required\n"); return 1; }
-    if (jsonl_path.empty()) { fprintf(stderr, "Error: --jsonl required\n"); return 1; }
-
-    g_collector.n_experts = n_experts;
-
-    LOG_INF("expert-profile: model     = %s\n", model_path.c_str());
-    LOG_INF("expert-profile: jsonl     = %s\n", jsonl_path.c_str());
-    LOG_INF("expert-profile: output    = %s\n", output_path.c_str());
-    LOG_INF("expert-profile: n_experts = %d\n", n_experts);
-    LOG_INF("expert-profile: ctx_size  = %d\n", ctx_size);
-    LOG_INF("expert-profile: ngl       = %d\n", n_gpu_layers);
-    LOG_INF("expert-profile: criterion = REAP (gate_weight * ||expert_out||_2)\n");
-
-    auto pairs = load_jsonl(jsonl_path);
-    if (pairs.empty()) { LOG_ERR("expert-profile: no pairs loaded\n"); return 1; }
-    LOG_INF("expert-profile: loaded %zu pairs\n", pairs.size());
-
-    llama_backend_init();
-
-    // Suppress INFO/WARN spam (CUDA graph warmup etc.), only pass errors through
-    llama_log_set([](enum ggml_log_level level, const char * text, void *) {
-        if (level >= GGML_LOG_LEVEL_ERROR) fputs(text, stderr);
-    }, nullptr);
-
-    llama_model_params mparams = llama_model_default_params();
-    mparams.n_gpu_layers = n_gpu_layers;
-
-    llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams);
-    if (!model) { LOG_ERR("expert-profile: failed to load model\n"); return 1; }
-
-    llama_context_params cparams  = llama_context_default_params();
-    cparams.n_ctx                 = ctx_size;
-    cparams.n_batch               = ctx_size;
-    cparams.n_ubatch              = std::min(ctx_size, 512);
-    cparams.n_threads             = n_threads;
-    cparams.type_k                = kv_type_k;
-    cparams.type_v                = kv_type_v;
-    cparams.cb_eval               = expert_eval_callback;
-    cparams.cb_eval_user_data     = nullptr;
-
-    llama_context * ctx = llama_init_from_model(model, cparams);
-    if (!ctx) { LOG_ERR("expert-profile: failed to create context\n"); return 1; }
-
-    LOG_INF("expert-profile: running forward passes...\n");
-    run_inference(ctx, model, pairs, ctx_size, output_path, save_every);
-    save_stats(output_path);
-
-    // Summary
-    LOG_INF("\n  MoE layers profiled: %zu\n", g_collector.layer_stats.size());
-    for (auto & [il, ls] : g_collector.layer_stats) {
-        // Find top and bottom REAP expert
-        int64_t top_e = 0, bot_e = 0;
-        double  top_v = 0.0, bot_v = 1e18;
-        for (int64_t i = 0; i < ls.n_experts; ++i) {
-            double v = (ls.activation_counts[i] > 0) ? ls.reap_sum[i] / ls.activation_counts[i] : 0.0;
-            if (v > top_v) { top_v = v; top_e = i; }
-            if (v < bot_v) { bot_v = v; bot_e = i; }
-        }
-        int64_t never = 0;
-        for (int64_t i = 0; i < ls.n_experts; ++i)
-            if (ls.activation_counts[i] == 0) ++never;
-        LOG_INF("  Layer %3d: tokens=%lld  never=%lld  reap_top=e%lld(%.4f)  reap_bot=e%lld(%.4f)\n",
-                il, (long long)ls.total_tokens, (long long)never,
-                (long long)top_e, top_v, (long long)bot_e, bot_v);
-    }
-
-    llama_free(ctx);
-    llama_model_free(model);
-    llama_backend_free();
-    return 0;
-}
diff --git a/tools/moe-pruning/README.md b/tools/moe-pruning/README.md
deleted file mode 100644
index a88499ac43..0000000000
--- a/tools/moe-pruning/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# MoE Expert Pruning Tools for NemotronH
-
-REAP-style expert pruning for `NVIDIA-Nemotron-3-Nano-30B-A3B` (and other
-NemotronH MoE models), implemented in two complementary ways:
-
-1. **`tools/expert-profile/`** — C++ profiler built into llama.cpp, collects
-   REAP scores directly from GGUF inference via the ggml eval callback.
-2. **`tools/moe-pruning/`** (this directory) — Python scripts to prune the model
-   using the collected scores, either on a GGUF file directly or on a
-   HuggingFace BF16 checkpoint.
-
----
-
-## Inspiration & Prior Art
-
-This work is a direct implementation of the **REAP** saliency criterion
-introduced in:
-
-> **REAP the Experts: Why Pruning Prevails for One-Shot MoE Compression**
-> Mike Lasby, Ivan Lazarevich, Nish Sinnadurai, Sean Lie, Yani Ioannou, Vithursan Thangarasa
-> Cerebras Research, 2025
-> arXiv: https://arxiv.org/abs/2510.13999
-> Code:  https://github.com/CerebrasResearch/reap
-
-The REAP score for expert `j` is (Equation 9 of the paper):
-
-```
-REAP(j) = mean_{t : j ∈ topk(t)} [ g_j(t) · ‖f_j(t)‖₂ ]
-```
-
-where `g_j(t)` is the router gate weight and `f_j(t)` is the expert FFN output
-(pre-weighting) for token `t`. Experts with the lowest REAP score contribute
-least to the layer output and are pruned first.
-
-The original REAP repo targets HuggingFace models via PyTorch hooks on
-standard architectures (Qwen3-MoE, Mixtral, DeepSeek-V2, Llama-4, …).
-
-**What we added / adapted:**
-
-- `tools/expert-profile/expert-profile.cpp` — llama.cpp C++ implementation
-  of REAP that intercepts `ffn_moe_topk`, `ffn_moe_weights`, and `ffn_moe_down`
-  tensors via `ggml_backend_eval_callback`, enabling REAP profiling on any
-  GGUF-quantised model (Q4_K_M, Q6_K, etc.) without needing full BF16 VRAM.
-
-- `gguf_prune.py` — prunes the GGUF file **directly**, slicing the expert axis
-  of the stacked weight tensors (`ffn_up_exps`, `ffn_down_exps`, `ffn_gate_inp`,
-  `ffn_exp_probs_b`) and patching `{arch}.expert_count` in the metadata.
-  Quantised blocks are preserved as raw bytes — no dequantise/requantise step.
-
-- `nemotron_reap.py` — HuggingFace-based alternative: profiles with 4-bit NF4
-  on GPU (phase 1) and prunes the BF16 checkpoint on CPU (phase 2). Adds
-  NemotronH (`NemotronHForCausalLM`) support that the original REAP repo does
-  not have.
-
----
-
-## Recommended Workflow (low-VRAM, e.g. RTX 4060 Ti 16 GB)
-
-```
-┌─────────────────────────────────────────────┐
-│  Phase 1 — Profile  (GPU, GGUF Q4, ~15 GB)  │
-│                                             │
-│  llama-expert-profile                       │
-│    -m nemotron-Q4_K_M.gguf                  │
-│    --jsonl sample_calibration.jsonl         │
-│    --output expert_stats.json               │
-│    -ngl 99 --ctx-size 2048                  │
-└───────────────────┬─────────────────────────┘
-                    │ expert_stats.json
-┌───────────────────▼─────────────────────────┐
-│  Phase 2 — Prune  (CPU, pure Python, ~2 GB) │
-│                                             │
-│  python gguf_prune.py                       │
-│    --input  nemotron-Q4_K_M.gguf            │
-│    --stats  expert_stats.json               │
-│    --output nemotron-pruned-26e.gguf        │
-│    --keep_ratio 0.20   # 26/128 experts     │
-└─────────────────────────────────────────────┘
-```
-
-At 20 % keep ratio a ~22 GB Q4_K_M becomes ~4.5 GB.
-
----
-
-## Files
-
-| File | Description |
-|---|---|
-| `gguf_prune.py` | GGUF-native pruner — no GPU needed, preserves quantisation |
-| `nemotron_reap.py` | HF-based pruner — 4-bit GPU profile + CPU BF16 prune |
-| `build_expert_profile.sh` | Build script for `llama-expert-profile` |
-| `run_nemotron_profile.sh` | Example profiling run |
-| `run_prune.sh` | Example pruning run |
-| `run_convert_quantize.sh` | Convert HF → GGUF and quantise |
-| `analyze_stats.py` | Visualise and compare expert stats JSON files |
-| `sample_calibration.jsonl` | Sample calibration data (prompt+response pairs) |
-| `expert_stats_reap.json` | Example stats output from expert-profile |
diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py
deleted file mode 100644
index 2e0821f323..0000000000
--- a/tools/moe-pruning/analyze_stats.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/env python3
-"""
-analyze_stats.py  --  Summarize expert_stats.json and model size projections.
-Usage: python analyze_stats.py [stats_file] [--keep 0.5]
-"""
-import json, statistics, argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("stats", nargs="?", default="expert_stats_reap.json")
-parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)")
-args = parser.parse_args()
-
-with open(args.stats) as f:
-    data = json.load(f)
-
-layers = sorted(data.keys(), key=int)
-n_layers = len(layers)
-keep_ratio = args.keep
-
-# Detect which scoring field is available (new REAP vs old importance_score)
-sample_layer = data[layers[0]]
-if "reap" in sample_layer:
-    score_field = "reap"
-    score_label = "REAP (gate_weight × ||expert_out||₂)"
-elif "importance_score" in sample_layer:
-    score_field = "importance_score"
-    score_label = "importance_score (freq × avg_gate_weight)  [legacy, no EAN]"
-else:
-    raise ValueError(f"No recognised score field in stats. Keys: {list(sample_layer.keys())}")
-
-# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ──────────────────
-N_EXPERTS        = 128
-N_EXPERT_USED    = 6       # top-k per token
-N_MOE_LAYERS     = 23
-N_TOTAL_LAYERS   = 53
-# Approximate parameter counts (bf16, billions)
-PARAMS_TOTAL_B        = 30.0
-PARAMS_MOE_EXPERTS_B  = 22.0   # bulk of MoE weight is in expert FFNs
-PARAMS_NON_MOE_B      = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B
-
-# ── Header ──────────────────────────────────────────────────────────────────
-print("=" * 70)
-print(f"  Expert Stats Analysis  |  file: {args.stats}")
-print("=" * 70)
-
-# ── Profiling completeness ───────────────────────────────────────────────────
-sample_tokens = list(data.values())[0]["total_tokens"]
-# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k
-# Approximate samples: total_tokens / avg_tokens_per_sample
-# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks
-# Better: just report tokens and note the user knows sample count
-print(f"\n── Profiling progress ──────────────────────────────────────────────────")
-print(f"  MoE layers profiled    : {n_layers} / {N_MOE_LAYERS}")
-print(f"  Tokens processed       : {sample_tokens:,}  (per layer)")
-act_sum = sum(data[layers[0]]["activation_counts"])
-assert abs(act_sum / sample_tokens - N_EXPERT_USED) < 0.01, "unexpected top-k"
-print(f"  top-k confirmed        : {N_EXPERT_USED}  (sum activations / tokens = {act_sum/sample_tokens:.1f})")
-
-# ── Per-layer importance score stats ────────────────────────────────────────
-print(f"\n── Per-layer score distribution  [{score_label}]")
-print(f"  {'Layer':>5}  {'Min':>9}  {'Max':>9}  {'Range':>9}  {'CV%':>6}  {'Never':>5}")
-global_cvs = []
-for k in layers:
-    d = data[k]
-    s = d[score_field]
-    mn, mx = min(s), max(s)
-    cv = statistics.stdev(s) / statistics.mean(s) * 100
-    global_cvs.append(cv)
-    print(f"  {k:>5}  {mn:>9.5f}  {mx:>9.5f}  {mx-mn:>9.5f}  {cv:>6.3f}%  {d['never_activated']:>5}")
-
-print(f"\n  Mean CV across layers  : {statistics.mean(global_cvs):.3f}%")
-print(f"  (CV < 1% = near-uniform; load-balancing is working as designed)")
-
-# ── Capacity loss sweep across pruning levels ────────────────────────────────
-# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens
-#   routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency.
-# Our implementation (llama.cpp): same formula but routing weights are the top-k
-#   gate weights (post-softmax within top-k), not the full softmax over all 128.
-# Impact: our weights are slightly higher than the paper's (renormalized to top-k
-#   only), but relative expert ranking within a layer should be preserved.
-#
-# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B):
-#   The model was trained with a strong load-balancing auxiliary loss, so all 128
-#   experts have nearly identical activation frequency (~4.69%) AND nearly identical
-#   REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution
-#   is a smooth monotone curve with NO natural elbow or gap.
-#
-#   This means:
-#   - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33%
-#   - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers
-#   - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight
-#     load-balancing or more expert specialization in pre-training)
-#   - For this model, actual quality loss must be measured via eval, not predicted
-#     from REAP score variance
-#
-# Metrics reported:
-# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good)
-# - vs_random%: how much more mass the REAP-selected set retains vs a random set
-#               of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins.
-# - Rel.gap:    score gap at cut / layer score range. Near 0 = no natural cut point.
-# - Gini:       inequality of score distribution. ~0.015 here = near-uniform.
-
-def gini(scores):
-    """Gini coefficient of a list of non-negative values."""
-    n = len(scores)
-    s = sorted(scores)
-    total = sum(s)
-    if total == 0:
-        return 0.0
-    cumsum = 0.0
-    for i, v in enumerate(s):
-        cumsum += (2 * (i + 1) - n - 1) * v
-    return cumsum / (n * total)
-
-def layer_stats(scores, n_keep):
-    """Return capacity metrics for a single layer at a given keep count."""
-    n = len(scores)
-    ranked = sorted(range(n), key=lambda i: scores[i], reverse=True)
-    total  = sum(scores)
-    kept_mass   = sum(scores[i] for i in ranked[:n_keep])
-    kept_frac   = kept_mass / total if total > 0 else 0.0     # fraction of REAP mass kept
-    random_frac = n_keep / n                                   # uniform expectation
-    vs_random   = kept_frac - random_frac                     # positive = REAP beats random
-    score_range = scores[ranked[0]] - scores[ranked[-1]]
-    gap         = scores[ranked[n_keep - 1]] - (scores[ranked[n_keep]] if n_keep < n else 0)
-    rel_gap     = gap / score_range if score_range > 0 else 0.0
-    return kept_frac * 100, vs_random * 100, rel_gap
-
-# Sweep over a range of keep ratios
-sweep_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75]
-if keep_ratio not in sweep_ratios:
-    sweep_ratios.append(keep_ratio)
-sweep_ratios = sorted(set(sweep_ratios))
-
-# Per-layer Gini (fixed, independent of keep ratio)
-layer_ginis = {k: gini(data[k][score_field]) for k in layers}
-mean_gini = statistics.mean(layer_ginis.values())
-worst_gini_layer = max(layer_ginis, key=lambda k: layer_ginis[k])
-
-print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────")
-print(f"  Gini measures how non-uniform REAP scores are within each layer.")
-print(f"  Gini=0: all experts identical. Gini=1: one expert dominates.")
-print(f"  With load-balanced MoE, Gini is small — but any Gini > 0 means")
-print(f"  REAP ranking beats random pruning.")
-print(f"")
-print(f"  {'Layer':>5}  {'Gini':>8}  {'Score range':>13}  {'Max/Min ratio':>14}")
-print(f"  {'-'*5}  {'-'*8}  {'-'*13}  {'-'*14}")
-for k in layers:
-    s = data[k][score_field]
-    mn, mx = min(s), max(s)
-    g = layer_ginis[k]
-    ratio_mm = mx / mn if mn > 0 else float('inf')
-    print(f"  {k:>5}  {g:>8.5f}  {mx-mn:>13.5f}  {ratio_mm:>13.3f}x")
-print(f"")
-print(f"  Mean Gini : {mean_gini:.5f}  (worst layer: {worst_gini_layer})")
-
-print(f"\n── Capacity retention sweep ─────────────────────────────────────────────")
-print(f"  Kept mass%  = REAP mass in KEPT experts as % of total (higher = better)")
-print(f"  vs.rand%    = Kept mass% minus uniform baseline (keep_ratio%)")
-print(f"                Positive = REAP beats random. Magnitude = advantage in pp.")
-print(f"  Rel.gap     = score gap at cut / layer score range (higher = cleaner cut)")
-print(f"  WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.")
-print(f"")
-print(f"  {'Keep':>5}  {'Experts':>7}  {'Kept mass%':>11}  {'vs.rand%':>9}  {'Rel.gap avg':>12}  {'Worst layer':>11}")
-print(f"  {'-'*5}  {'-'*7}  {'-'*11}  {'-'*9}  {'-'*12}  {'-'*11}")
-
-sweep_results = {}
-for ratio in sweep_ratios:
-    nk = max(1, round(N_EXPERTS * ratio))
-    mass_fracs, excesses, rel_gaps = [], [], []
-    worst_excess, worst_layer_id = -999.0, None
-    for k in layers:
-        scores = data[k][score_field]
-        mf, exc, rg = layer_stats(scores, nk)
-        mass_fracs.append(mf)
-        excesses.append(exc)
-        rel_gaps.append(rg)
-        if exc > worst_excess:
-            worst_excess = exc
-            worst_layer_id = k
-    avg_mf  = statistics.mean(mass_fracs)
-    avg_exc = statistics.mean(excesses)
-    avg_rg  = statistics.mean(rel_gaps)
-    marker  = " <--" if abs(ratio - keep_ratio) < 1e-9 else ""
-    print(f"  {ratio:>5.0%}  {nk:>7d}  {avg_mf:>10.2f}%  {avg_exc:>+9.2f}%  {avg_rg:>11.4f}  layer {worst_layer_id:>3}{marker}")
-    sweep_results[ratio] = {
-        "n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc,
-        "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess,
-    }
-
-print(f"")
-print(f"  vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.")
-print(f"  For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.")
-print(f"  Rel.gap near zero means scores are smooth with no natural cut — any threshold")
-print(f"  is as defensible as another. Actual quality delta requires empirical eval.")
-
-# ── Expert keep/prune detail at selected keep_ratio ──────────────────────────
-n_keep   = max(1, round(N_EXPERTS * keep_ratio))
-n_prune  = N_EXPERTS - n_keep
-
-print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%}  ({n_keep} keep / {n_prune} prune per layer) ──")
-print(f"  {'Layer':>5}  {'Kept mass%':>11}  {'vs.rand%':>9}  {'Rel.gap':>9}  {'Min kept':>10}  {'Max pruned':>11}")
-print(f"  {'-'*5}  {'-'*11}  {'-'*9}  {'-'*9}  {'-'*10}  {'-'*11}")
-
-layer_results = {}
-for k in layers:
-    scores = data[k][score_field]
-    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
-    mf, exc, rg = layer_stats(scores, n_keep)
-    min_kept   = scores[ranked[n_keep - 1]]
-    max_pruned = scores[ranked[n_keep]] if n_prune > 0 else 0
-    layer_results[k] = {"mass_frac": mf, "excess": exc, "rel_gap": rg,
-                        "min_kept": min_kept, "max_pruned": max_pruned}
-    print(f"  {k:>5}  {mf:>10.2f}%  {exc:>+9.2f}%  {rg:>9.4f}  {min_kept:>10.5f}  {max_pruned:>11.5f}")
-
-avg_mf  = statistics.mean(r["mass_frac"] for r in layer_results.values())
-avg_exc = statistics.mean(r["excess"]    for r in layer_results.values())
-avg_rg  = statistics.mean(r["rel_gap"]   for r in layer_results.values())
-print(f"  {'AVG':>5}  {avg_mf:>10.2f}%  {avg_exc:>+9.2f}%  {avg_rg:>9.4f}")
-
-# ── Model size projections ───────────────────────────────────────────────────
-print(f"\n── Model size projections ──────────────────────────────────────────────")
-
-def model_size(keep):
-    expert_params = PARAMS_MOE_EXPERTS_B * keep
-    return PARAMS_NON_MOE_B + expert_params
-
-original_b   = model_size(1.0)
-pruned_b     = model_size(keep_ratio)
-reduction_pct = (1 - pruned_b / original_b) * 100
-
-# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB)
-quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)]
-
-print(f"  {'':20}  {'Original':>10}  {'Pruned':>10}  {'Saved':>8}")
-print(f"  {'Parameters (B)':20}  {original_b:>10.1f}  {pruned_b:>10.1f}  {original_b-pruned_b:>8.1f}B")
-print(f"  {'Reduction':20}  {'':>10}  {reduction_pct:>9.1f}%")
-print()
-print(f"  Estimated GGUF sizes:")
-print(f"  {'Quant':10}  {'Original':>10}  {'Pruned':>10}  {'Fits in':>12}")
-for name, bpw in quants:
-    orig_gb  = original_b * bpw / 8
-    prune_gb = pruned_b   * bpw / 8
-    # VRAM fit (16GB GPU)
-    fits = "16GB GPU" if prune_gb <= 15.5 else ("32GB GPU" if prune_gb <= 31 else "CPU/RAM")
-    print(f"  {name:10}  {orig_gb:>9.1f}G  {prune_gb:>9.1f}G  {fits:>12}")
-
-# ── Active params per token (inference cost) ─────────────────────────────────
-print(f"\n── Inference cost (active params per token) ────────────────────────────")
-# Active params = non-moe + (n_expert_used/n_experts_kept * moe_expert_params)
-# After pruning: router still picks top-k but from n_keep pool
-# Active expert params per token = (N_EXPERT_USED / n_keep) * (PARAMS_MOE_EXPERTS_B * keep_ratio)
-# But actually active params = N_EXPERT_USED * (params per single expert)
-params_per_expert_orig   = PARAMS_MOE_EXPERTS_B / N_EXPERTS          # B per expert
-params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep  # same, just fewer experts
-
-active_orig   = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig   * N_MOE_LAYERS / N_TOTAL_LAYERS
-active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS
-
-print(f"  Original  : {active_orig:.2f}B active params/token  (same expert size, more choice)")
-print(f"  Pruned    : {active_pruned:.2f}B active params/token  (same — top-k still fires {N_EXPERT_USED} experts)")
-print(f"  Note: active params per token are IDENTICAL — pruning only reduces")
-print(f"        model file size and memory footprint, not per-token compute.")
-
-# ── Consistently low-importance experts ──────────────────────────────────────
-print(f"\n── Experts consistently ranked low across all layers ───────────────────")
-bottom_n = max(1, round(N_EXPERTS * 0.10))  # bottom 10%
-low_count = {}
-for k in layers:
-    scores = data[k][score_field]
-    ranked = sorted(range(len(scores)), key=lambda i: scores[i])
-    for eid in ranked[:bottom_n]:
-        low_count[eid] = low_count.get(eid, 0) + 1
-
-consistent = sorted(low_count.items(), key=lambda x: -x[1])
-consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
-print(f"  (bottom 10% in >= 3 layers — most dispensable experts globally)")
-print(f"  Expert ID : layers in bottom 10%")
-for eid, cnt in consistent[:20]:
-    bar = "█" * cnt
-    print(f"  Expert {eid:>3} : {cnt:>2}/{n_layers}  {bar}")
-
-print()
-print("=" * 70)
diff --git a/tools/moe-pruning/build_expert_profile.sh b/tools/moe-pruning/build_expert_profile.sh
deleted file mode 100644
index 0b39604426..0000000000
--- a/tools/moe-pruning/build_expert_profile.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-# build_expert_profile.sh
-# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA.
-# Run this from the tools/moe-pruning/ directory: bash build_expert_profile.sh
-
-set -e
-
-LLAMA_SRC="../.."
-BUILD_DIR="$LLAMA_SRC/build_expert"
-
-echo "=== Building llama.cpp + expert-profile tool ==="
-echo "  Source : $LLAMA_SRC"
-echo "  Build  : $BUILD_DIR"
-
-mkdir -p "$BUILD_DIR"
-cd "$BUILD_DIR"
-
-# Configure with CUDA
-cmake "$LLAMA_SRC" \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DGGML_CUDA=ON \
-    -DLLAMA_CURL=OFF \
-    -DLLAMA_BUILD_TESTS=OFF \
-    -DLLAMA_BUILD_EXAMPLES=OFF \
-    -DCMAKE_CUDA_ARCHITECTURES=86 \
-    2>&1 | tail -20
-
-# Build only the expert-profile target (fast)
-cmake --build . --target llama-expert-profile --config Release -j$(nproc)
-
-echo ""
-echo "=== Build complete ==="
-echo "  Binary: $BUILD_DIR/tools/expert-profile/llama-expert-profile"
-echo ""
-echo "=== Usage ==="
-echo "  $BUILD_DIR/tools/expert-profile/llama-expert-profile \\"
-echo "    -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\"
-echo "    --jsonl ./sample_calibration.jsonl \\"
-echo "    --output ./expert_stats_reap.json \\"
-echo "    --n-experts 128 \\"
-echo "    --ctx-size 16384 \\"
-echo "    -ngl 99"
diff --git a/tools/moe-pruning/extract_ppl.py b/tools/moe-pruning/extract_ppl.py
deleted file mode 100644
index 972a32e99d..0000000000
--- a/tools/moe-pruning/extract_ppl.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import json, os
-
-base = os.path.dirname(os.path.abspath(__file__))
-
-lines = open(os.path.join(base, 'rwsft-training-data.jsonl'), encoding='utf-8').readlines()
-split = int(len(lines) * 0.95)
-
-train_lines = lines[:split]
-val_lines   = lines[split:]
-
-train_out = os.path.join(base, 'ppl-eval-train.txt')
-val_out   = os.path.join(base, 'ppl-eval-val.txt')
-
-def fmt(s):
-    # Full prompt+response so the model is conditioned correctly.
-    # llama-perplexity scores all tokens, but the prompt PPL is identical
-    # for base vs adapter — the delta is driven by the response tokens.
-    prompt   = s.get('prompt', '').strip()
-    response = s.get('response', '').strip()
-    if not response:
-        return None
-    if prompt:
-        return prompt + '\n' + response
-    return response
-
-with open(train_out, 'w', encoding='utf-8') as f:
-    for line in train_lines:
-        text = fmt(json.loads(line))
-        if text:
-            f.write(text + '\n\n')
-
-with open(val_out, 'w', encoding='utf-8') as f:
-    for line in val_lines:
-        text = fmt(json.loads(line))
-        if text:
-            f.write(text + '\n\n')
-
-train_chars = len(open(train_out, encoding='utf-8').read())
-val_chars   = len(open(val_out,   encoding='utf-8').read())
-print(f'train: {len(train_lines)} samples, {train_chars:,} chars -> ppl-eval-train.txt')
-print(f'val:   {len(val_lines)} samples,  {val_chars:,} chars  -> ppl-eval-val.txt')
diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py
deleted file mode 100644
index df3e638ab4..0000000000
--- a/tools/moe-pruning/gguf_prune.py
+++ /dev/null
@@ -1,260 +0,0 @@
-"""
-gguf-prune: REAP-based expert pruning directly on a GGUF file.
-
-Slices the expert dimension of the four stacked MoE weight tensors per layer:
-    blk.{il}.ffn_up_exps      [n_embd, intermediate, n_experts]
-    blk.{il}.ffn_down_exps    [intermediate, n_embd, n_experts]
-    blk.{il}.ffn_gate_inp     [n_embd, n_experts]
-    blk.{il}.ffn_exp_probs_b  [n_experts]  (score-correction bias, if present)
-
-Quantized blocks (Q4_K, Q6_K, …) are preserved as raw bytes — slicing the
-expert axis (last dim) is safe because each expert is independently quantised
-in ggml, so dropping experts = dropping whole quantisation blocks.
-
-Metadata patched:
-    {arch}.expert_count  → keep_n
-    (expert_used_count = top-k routing k, NOT touched)
-
-Usage:
-    # keep top 20% of experts (26/128) per MoE layer
-    python gguf_prune.py \\
-        --input  nemotron.gguf \\
-        --stats  expert_stats.json \\
-        --output nemotron-pruned.gguf \\
-        --keep_ratio 0.20
-
-    # or keep an absolute number
-    python gguf_prune.py \\
-        --input  nemotron.gguf \\
-        --stats  expert_stats.json \\
-        --output nemotron-pruned.gguf \\
-        --keep_n 32
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import re
-from pathlib import Path
-
-import numpy as np
-from gguf import GGUFReader, GGUFWriter, GGUFValueType
-
-
-# ── Constants ─────────────────────────────────────────────────────────────────
-
-# Base tensor names that carry the expert dimension (last axis in ggml layout).
-# Some GGUFs append parameter tails like ".weight" / ".bias".
-EXPERT_BASE_SUFFIXES = {
-    "ffn_up_exps",
-    "ffn_down_exps",
-    "ffn_gate_inp",
-}
-
-
-def is_expert_suffix(suffix: str) -> bool:
-    """Return True if a tensor suffix is one of the MoE expert tensors to prune."""
-    if suffix in ("ffn_exp_probs_b", "exp_probs_b", "exp_probs_b.bias"):
-        return True
-    return any(suffix == base or suffix.startswith(base + ".") for base in EXPERT_BASE_SUFFIXES)
-
-
-# ── Helpers ───────────────────────────────────────────────────────────────────
-
-def layer_and_suffix(name: str) -> tuple[int, str] | tuple[None, None]:
-    m = re.match(r"blk\.(\d+)\.(.+)$", name)
-    if m:
-        return int(m.group(1)), m.group(2)
-    return None, None
-
-
-def pick_experts(layer_stats: dict, keep_n: int) -> list[int]:
-    """
-    Return sorted indices of the top `keep_n` experts by REAP score.
-    Falls back to 'importance_score' (weighted frequency) if 'reap' absent.
-    """
-    if "reap" in layer_stats:
-        scores = np.array(layer_stats["reap"], dtype=np.float64)
-    elif "importance_score" in layer_stats:
-        scores = np.array(layer_stats["importance_score"], dtype=np.float64)
-    else:
-        raise KeyError(
-            "Layer stats has neither 'reap' nor 'importance_score'. "
-            "Run expert-profile / nemotron_reap.py profile first."
-        )
-    return sorted(np.argsort(scores)[-keep_n:].tolist())
-
-
-def slice_expert_axis(data: np.ndarray, keep: list[int]) -> np.ndarray:
-    """
-    Slice the expert axis of reader tensor data keeping only `keep` indices.
-
-    GGUFReader reshapes tensors to NumPy with reversed ggml dims, so for MoE
-    tensors where experts are the last ggml dim, expert is axis 0 in `data`.
-    This also preserves quantized row-byte alignment (axis -1 is byte-packed
-    rows for quantized tensors and must not be sliced for expert pruning).
-    """
-    return np.take(data, keep, axis=0)
-
-
-def copy_field(writer: GGUFWriter, field, reader: GGUFReader) -> bool:
-    """Copy a single metadata field to writer. Returns False if skipped."""
-    key = field.name
-    val_type = field.types[0]
-    part = field.parts[-1]
-
-    if val_type == GGUFValueType.STRING:
-        # Preserve raw bytes: GGUF metadata can contain non-UTF8 strings.
-        writer.add_key_value(key, bytes(part), GGUFValueType.STRING)
-    elif val_type == GGUFValueType.UINT8:
-        writer.add_uint8(key, int(part[0]))
-    elif val_type == GGUFValueType.INT8:
-        writer.add_int8(key, int(part[0]))
-    elif val_type == GGUFValueType.UINT16:
-        writer.add_uint16(key, int(part[0]))
-    elif val_type == GGUFValueType.INT16:
-        writer.add_int16(key, int(part[0]))
-    elif val_type == GGUFValueType.UINT32:
-        writer.add_uint32(key, int(part[0]))
-    elif val_type == GGUFValueType.INT32:
-        writer.add_int32(key, int(part[0]))
-    elif val_type == GGUFValueType.FLOAT32:
-        writer.add_float32(key, float(part[0]))
-    elif val_type == GGUFValueType.UINT64:
-        writer.add_uint64(key, int(part[0]))
-    elif val_type == GGUFValueType.INT64:
-        writer.add_int64(key, int(part[0]))
-    elif val_type == GGUFValueType.FLOAT64:
-        writer.add_float64(key, float(part[0]))
-    elif val_type == GGUFValueType.BOOL:
-        writer.add_bool(key, bool(part[0]))
-    elif val_type == GGUFValueType.ARRAY:
-        elem_type = field.types[1]
-        if elem_type == GGUFValueType.STRING:
-            # ReaderField.data stores indices of ARRAY payload items; for
-            # STRING arrays this points at each string byte payload.
-            vals = [bytes(field.parts[idx]) for idx in field.data]
-            writer.add_key_value(key, vals, GGUFValueType.ARRAY, sub_type=GGUFValueType.STRING)
-        else:
-            # ReaderField.data stores part-indices, not payload values.
-            vals = field.contents()
-            if not isinstance(vals, list):
-                print(f"  WARNING: skipping array field {key!r} (unexpected non-list contents)")
-                return False
-            writer.add_array(key, vals)
-    else:
-        print(f"  WARNING: skipping field {key!r} (unsupported type {val_type})")
-        return False
-    return True
-
-
-# ── Main ──────────────────────────────────────────────────────────────────────
-
-def main():
-    ap = argparse.ArgumentParser(description="REAP expert pruning on a GGUF file")
-    ap.add_argument("--input",      required=True,              help="Input .gguf path")
-    ap.add_argument("--stats",      required=True,              help="expert_stats.json from expert-profile")
-    ap.add_argument("--output",     required=True,              help="Output .gguf path")
-    ap.add_argument("--keep_ratio", type=float, default=None,   help="Fraction to keep, e.g. 0.20")
-    ap.add_argument("--keep_n",     type=int,   default=None,   help="Absolute count to keep, e.g. 32")
-    ap.add_argument("--n_experts",  type=int,   default=128,    help="Experts per MoE layer in source model")
-    args = ap.parse_args()
-
-    if args.keep_ratio is None and args.keep_n is None:
-        ap.error("Provide --keep_ratio or --keep_n")
-    if args.keep_ratio is not None and args.keep_n is not None:
-        ap.error("Provide --keep_ratio OR --keep_n, not both")
-
-    keep_n = args.keep_n if args.keep_n is not None else max(1, int(args.n_experts * args.keep_ratio))
-    print(f"[gguf-prune] keeping {keep_n}/{args.n_experts} experts per MoE layer")
-
-    # ── Load stats ─────────────────────────────────────────────────────────────
-    with open(args.stats) as f:
-        stats = {int(k): v for k, v in json.load(f).items()}
-    print(f"[gguf-prune] stats loaded for {len(stats)} MoE layers")
-
-    # ── Open source GGUF ───────────────────────────────────────────────────────
-    print(f"[gguf-prune] reading  {args.input}")
-    reader = GGUFReader(args.input, mode="r")
-
-    arch_field = reader.get_field("general.architecture")
-    arch = str(bytes(arch_field.parts[-1]), "utf-8") if arch_field else "nemotron_h_moe"
-    print(f"[gguf-prune] arch     {arch}")
-
-    expert_count_key = f"{arch}.expert_count"
-
-    # ── Compute kept indices per layer ─────────────────────────────────────────
-    kept: dict[int, list[int]] = {}
-    for tensor in reader.tensors:
-        il, suffix = layer_and_suffix(tensor.name)
-        if il is None or suffix is None or not is_expert_suffix(suffix):
-            continue
-        if il in kept:
-            continue  # already computed for this layer
-        if il not in stats:
-            print(f"  Layer {il:3d}: no stats — keeping ALL {args.n_experts} experts")
-            kept[il] = list(range(args.n_experts))
-        else:
-            kept[il] = pick_experts(stats[il], keep_n)
-            never = stats[il].get("never_activated", "?")
-            crit  = "reap" if "reap" in stats[il] else "importance_score"
-            print(f"  Layer {il:3d}: keep {kept[il][:4]}…  never_activated={never}  criterion={crit}")
-
-    # ── Build output GGUF ──────────────────────────────────────────────────────
-    print(f"\n[gguf-prune] writing  {args.output}")
-    writer = GGUFWriter(args.output, arch=arch)
-
-    # --- metadata: copy all fields, replace expert_count ---
-    for field in reader.fields.values():
-        # Reader exposes synthetic header fields (GGUF.*) that are not KV
-        # metadata and must not be copied back as normal keys.
-        if field.name.startswith("GGUF."):
-            continue
-        # Writer already sets general.architecture from ctor; avoid duplicate warning.
-        if field.name in (expert_count_key, "general.architecture"):
-            continue  # replaced below
-        copy_field(writer, field, reader)
-
-    writer.add_expert_count(keep_n)
-    print(f"[gguf-prune] patched  {expert_count_key} → {keep_n}")
-
-    # --- tensors ---
-    n_pruned = 0
-    for tensor in reader.tensors:
-        il, suffix = layer_and_suffix(tensor.name)
-        is_expert = il is not None and suffix is not None and is_expert_suffix(suffix)
-
-        if is_expert:
-            assert il is not None
-            k = kept[il]
-            data = slice_expert_axis(tensor.data, k)
-            writer.add_tensor(
-                tensor.name,
-                data,
-                raw_dtype=tensor.tensor_type,
-            )
-            n_pruned += 1
-        else:
-            writer.add_tensor(
-                tensor.name,
-                tensor.data,
-                raw_dtype=tensor.tensor_type,
-            )
-
-    writer.write_header_to_file()
-    writer.write_kv_data_to_file()
-    writer.write_tensors_to_file(progress=True)
-    writer.close()
-
-    out = Path(args.output)
-    size_gb = out.stat().st_size / 1024**3
-    print(f"\n[gguf-prune] done")
-    print(f"  Expert tensors sliced : {n_pruned}")
-    print(f"  MoE layers pruned     : {len(kept)}")
-    print(f"  Experts per layer     : {keep_n}/{args.n_experts}")
-    print(f"  Output size           : {size_gb:.2f} GB  →  {out}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt
deleted file mode 100644
index 6b21f239f4..0000000000
--- a/tools/moe-pruning/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-gguf
diff --git a/tools/moe-pruning/sample_calibration.jsonl b/tools/moe-pruning/sample_calibration.jsonl
deleted file mode 100644
index 44e2627f97..0000000000
--- a/tools/moe-pruning/sample_calibration.jsonl
+++ /dev/null
@@ -1,8 +0,0 @@
-{"prompt": "What is the difference between a transformer and an RNN?", "response": "Transformers process all tokens in parallel using self-attention, while RNNs process tokens sequentially and maintain a hidden state. Transformers scale better and handle long-range dependencies more effectively, but require more memory for the attention matrix."}
-{"prompt": "Explain the concept of weight quantization in neural networks.", "response": "Weight quantization reduces model size and inference latency by representing parameters in lower precision formats such as INT8 or INT4 instead of FP32. This trades a small amount of accuracy for significant gains in memory efficiency and compute throughput."}
-{"prompt": "What is mixture of experts (MoE)?", "response": "MoE is a neural network architecture where the feed-forward layers are replaced by multiple expert sub-networks and a routing mechanism. For each token, only a small subset of experts is activated, allowing the model to have a large parameter count while keeping inference compute constant."}
-{"prompt": "Describe the attention mechanism.", "response": "Attention computes a weighted sum of value vectors, where the weights are determined by the compatibility between query and key vectors. Scaled dot-product attention computes scores as Q*K^T/sqrt(d_k), applies softmax to get weights, then multiplies by V."}
-{"prompt": "What is GGUF and how does it differ from GGML?", "response": "GGUF is the successor to the GGML file format for storing quantized models. It supports arbitrary key-value metadata, is extensible without breaking backward compatibility, and encodes tensor names and shapes explicitly, making it more robust than the original GGML format."}
-{"prompt": "How does LoRA work?", "response": "LoRA (Low-Rank Adaptation) injects trainable rank-decomposition matrices A and B into frozen weight layers. The adapted weight is W + alpha/r * B*A. Since rank r is much smaller than the weight dimensions, only a tiny fraction of parameters are trained."}
-{"prompt": "What is perplexity in language modeling?", "response": "Perplexity measures how well a language model predicts a sample text. It is the exponentiated average negative log-likelihood per token: PPL = exp(-1/N * sum log P(token_i)). Lower perplexity indicates a better fit to the data."}
-{"prompt": "Explain rotary position embeddings (RoPE).", "response": "RoPE encodes position by rotating query and key vectors in 2D subspaces using a position-dependent rotation matrix. This makes the dot product between Q and K depend only on their relative position, enabling the model to generalise to sequence lengths longer than those seen during training."}