# Source: llama.cpp/scripts/compute-imatrix.py
#!/usr/bin/env python3
"""Compute imatrix (importance matrix) from captured activation tensors.
The imatrix is the per-dimension sum-of-squares of the activations.
It's what upstream llama.cpp uses to weight quantization optimization.
For each activation file act_blkL_*.f32bin, produces imatrix_blkL_<role>.f32bin
where <role> matches the weight tensor it multiplies with.
Format: flat float32 array of length n_per_row, one importance value per dimension.
"""
import numpy as np
import struct
import os
# Directory holding the captured activation files, relative to this script.
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")


def load_f32_tensor(name):
    """Load a float32 matrix named *name* from DATA_DIR.

    File format: a 16-byte header of two native-endian int64 values
    (nrow, ncol), followed by nrow * ncol float32 values.

    Returns an (nrow, ncol) numpy array.
    Raises ValueError if the payload length does not match the header.
    (Was an `assert`, which silently disappears under `python -O`.)
    """
    path = os.path.join(DATA_DIR, name)
    with open(path, "rb") as f:
        # Header: two native int64 values giving the matrix shape.
        nrow, ncol = struct.unpack("qq", f.read(16))
        data = np.frombuffer(f.read(), dtype=np.float32)
    if len(data) != nrow * ncol:
        raise ValueError(
            f"{path}: expected {nrow * ncol} float32 values "
            f"({nrow}x{ncol}), got {len(data)}"
        )
    return data.reshape(nrow, ncol)
def save_imatrix(name, data):
    """Write *data* to DATA_DIR as a flat float32 array and log its stats."""
    out_path = os.path.join(DATA_DIR, name)
    data.astype(np.float32).tofile(out_path)
    # Stats are reported on the original array, before the float32 cast.
    stats = f"min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}"
    print(f" Wrote {out_path}: {len(data)} dims, " + stats)
# Mapping: activation file -> imatrix file for each weight it multiplies with.
# Each weight tensor's column dimension matches the activation's column dimension.
# Rows: (activation capture file, output imatrix file, human-readable description)
_MAPPING_SPECS = [
    (
        "act_blk0_ffn_input.f32bin",
        "imatrix_blk0_ffn_gate_up.f32bin",
        "ffn_gate and ffn_up (both use ffn_input activation)",
    ),
    (
        "act_blk0_ffn_down_input.f32bin",
        "imatrix_blk0_ffn_down.f32bin",
        "ffn_down (uses SwiGLU activation)",
    ),
    (
        "act_blk0_attn_input.f32bin",
        "imatrix_blk0_attn_qkv.f32bin",
        "attn_q, attn_k, attn_v (all use attn_input activation)",
    ),
    (
        "act_blk0_attn_output_input.f32bin",
        "imatrix_blk0_attn_output.f32bin",
        "attn_output (uses kqv_out activation)",
    ),
]
mappings = [
    {"act_file": act, "imatrix_name": out, "description": desc}
    for act, out, desc in _MAPPING_SPECS
]
def _report_concentration(imatrix):
    """Print how much of the total importance mass sits in the top 1% / 10% dims."""
    total = imatrix.sum()
    sorted_im = np.sort(imatrix)[::-1]
    print(" Power concentration:")
    for frac, label in ((0.01, "1%"), (0.10, "10%")):
        # Guard against tiny vectors where int(len * frac) would round to 0.
        k = max(1, int(len(imatrix) * frac))
        share = sorted_im[:k].sum() / total * 100
        print(f" Top {label} dims ({k}): {share:.1f}% of total")


def _process_mapping(m):
    """Load one activation capture, compute its imatrix, report stats, save it."""
    A = load_f32_tensor(m["act_file"])
    print(f"\n{m['description']}:")
    print(f" Activation: {A.shape[0]} tokens × {A.shape[1]} dims")
    # imatrix = per-dimension sum over tokens of activation^2 -- the standard
    # definition llama.cpp uses to weight quantization optimization.
    sq = np.square(A)  # hoisted: shared by the imatrix and the reference RMS
    imatrix = sq.sum(axis=0)
    rms = np.sqrt(sq.mean(axis=0))  # per-dim RMS, printed for reference only
    print(
        f" Imatrix stats: min={imatrix.min():.6f}, max={imatrix.max():.6f}, "
        f"mean={imatrix.mean():.6f}, std={imatrix.std():.6f}"
    )
    print(
        f" RMS stats: min={rms.min():.6f}, max={rms.max():.6f}, "
        f"mean={rms.mean():.6f}"
    )
    _report_concentration(imatrix)
    save_imatrix(m["imatrix_name"], imatrix)


def main():
    """Compute an imatrix file for every configured activation mapping."""
    print("Computing imatrix from captured activations")
    print("=" * 60)
    for m in mappings:
        try:
            _process_mapping(m)
        except Exception as e:
            # Best-effort by design: a missing or truncated capture file
            # should not abort the remaining mappings.
            print(f" SKIP: {e}")
    print("\nDone.")


if __name__ == "__main__":
    main()