llama.cpp/tools/moe-pruning/build_expert_profile.sh

#!/usr/bin/env bash
# build_expert_profile.sh
# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA.
#
# Usage: bash build_expert_profile.sh
#   All paths are resolved from this script's own location
#   (tools/moe-pruning/), so it may be invoked from any directory.
#
# Environment:
#   CUDA_ARCH  Value passed to CMAKE_CUDA_ARCHITECTURES (default: 86).

# -e: stop on the first failing command; -u: reject unset variables;
# pipefail: a failing cmake must not be hidden by the trailing `tail`.
set -euo pipefail

# Resolve absolute paths so nothing depends on the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
LLAMA_SRC="$(cd -- "$SCRIPT_DIR/../.." && pwd)"
BUILD_DIR="$LLAMA_SRC/build_expert"
CUDA_ARCH="${CUDA_ARCH:-86}"

echo "=== Building llama.cpp + expert-profile tool ==="
echo "  Source : $LLAMA_SRC"
echo "  Build  : $BUILD_DIR"

mkdir -p "$BUILD_DIR"

# Configure with CUDA. -S/-B name the source and build trees explicitly, so
# no `cd` is needed and a relative source path cannot be misresolved.
# Only the last 20 lines of configure output are shown.
cmake -S "$LLAMA_SRC" -B "$BUILD_DIR" \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CUDA=ON \
    -DLLAMA_CURL=OFF \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_BUILD_EXAMPLES=OFF \
    -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCH" \
    2>&1 | tail -20

# Build only the expert-profile target (fast)
cmake --build "$BUILD_DIR" --target llama-expert-profile --config Release -j "$(nproc)"

# Report where the binary actually landed: projects that set
# CMAKE_RUNTIME_OUTPUT_DIRECTORY place executables under <build>/bin rather
# than next to the target's CMakeLists. Fall back to the per-target path.
BIN="$BUILD_DIR/tools/expert-profile/llama-expert-profile"
if [[ ! -x "$BIN" && -x "$BUILD_DIR/bin/llama-expert-profile" ]]; then
    BIN="$BUILD_DIR/bin/llama-expert-profile"
fi

# Note: inside an unquoted here-document `\\` yields a literal backslash;
# a single trailing backslash would be treated as a line continuation.
cat <<EOF

=== Build complete ===
  Binary: $BIN

=== Usage ===
  $BIN \\
    -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\
    --jsonl ./sample_calibration.jsonl \\
    --output ./expert_stats_reap.json \\
    --n-experts 128 \\
    --ctx-size 16384 \\
    -ngl 99
EOF