ggml : add NVFP4 quantization type support (#19769)
* WIP: add NVFP4 quantization support * tests * improve NVFP4 dot product implementation performance and fix bad super call * typo * Use nvfp4 kvalues * vulkan : fix NVFP4 shader compilation by including kvalues_mxfp4 lookup table * vulcal and perf fixes * wip * Fix metal * fix vulcan * Rename threshold & fix wrong scale * Fix MOE * Shelf backend implementations (CUDA, Metal, Vulkan, arch-specific SIMD) Remove NVFP4 support from GPU backends and architecture-specific optimized dot products. These should be added in separate PRs so backend specialists can review them independently. Reverted files: - ggml-cuda: common.cuh, convert.cu, mmq.cu/cuh, mmvq.cu, vecdotq.cuh, quantize.cu/cuh, mma.cuh, ggml-cuda.cu, fattn-tile.cuh - ggml-metal: ggml-metal.metal, ggml-metal-device.cpp, ggml-metal-impl.h, ggml-metal-ops.cpp - ggml-vulkan: ggml-vulkan.cpp, all vulkan-shaders/* - ggml-cpu arch: arm/quants.c, x86/quants.c, powerpc/quants.c, s390/quants.c Core NVFP4 support (type definition, CPU fallback dot product, quantization, dequantization, conversion) is retained. * Fix arch-fallback.h: add NVFP4 generic fallback for all platforms After shelving backend-specific SIMD implementations, the generic CPU dot product needs to be aliased on ARM, x86, PowerPC, and s390 platforms that previously relied on arch-specific versions. * quantize: add NVFP4 as a quantization type option * Fix ggml_fp32_to_ue4m3: handle subnormal values Previously, values with ue4m3_exp <= 0 were clamped to 0, causing all small scales to underflow. This made NVFP4 quantization via llama-quantize produce garbage (PPL = 5.8M) since typical transformer weights have amax/6.0 in the range 0.001-0.01, which falls in the UE4M3 subnormal range. Now subnormals are properly encoded as man * 2^-9 (exp=0, man=1..7), matching the decode path in ggml_ue4m3_to_fp32. Result: NVFP4 requantization now produces PPL = 15.25 (vs F16 = 14.33), comparable to Q4_1 (PPL = 15.81) at slightly lower BPW (4.70 vs 5.15). * Restore ARM NEON NVFP4 dot product implementation Restores the optimized ggml_vec_dot_nvfp4_q8_0 for ARM NEON using vqtbl1q_s8 lookup and ggml_vdotq_s32 dot products. tg128 performance: 4.37 t/s (generic) -> 13.66 t/s (NEON) = 3.1x speedup * Optimize ARM NEON NVFP4 dot product: LUT + vpaddq + vfmaq - Add ue4m3_scale_lut[128] to ggml-common.h replacing branch-heavy ggml_ue4m3_to_fp32() in the hot loop - Use vpaddq_s32 for pairwise int32 reduction instead of vaddvq_s32 - Accumulate with vfmaq_f32 into float32x4_t vector accumulators tg128: 8.1 -> 31.0 t/s (3.8x speedup, 77% of Q4_1 speed) * ARM NEON NVFP4: rearrange q8 to match nibble layout Alternative approach: rearrange q8 data to match the NVFP4 lo/hi nibble layout instead of rearranging the looked-up NVFP4 values. Eliminates vcombine_s8(vget_low, vget_low) shuffles. Performance is equivalent (~18.5 t/s) - the bottleneck is the 2x block overhead from QK=16 vs QK=32, not the shuffle instructions. * CPU only backend 64 super-block layout * cleanup * Remove unused LUT * int * exclude NVFP4 from unsupported ops in metal build * remove quantization for now * store scales as native UE4M3, preserve original model bits when possible * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * correct comment * format * reduce duplication and cleanup * Address comments * move detection to prepare_tensors * Use math instead of const * Move * fix comment * Shelf quantize tests * Rebase and move check * cleanup * lint * Update gguf-py/gguf/scripts/gguf_convert_endian.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Use fallback quant config * Simplify Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * organize * Refactor * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * add quantize_nvfp4 (required for test_quants.py) * add quantize_nvfp4 (required for test_quants.py) * add quantize_nvfp4 (required for test_quants.py) * fix return type --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
3ca19b0e9f
commit
5eae9cb1d9
|
|
@ -144,6 +144,7 @@ class ModelBase:
|
|||
self.metadata_override = metadata_override
|
||||
self.model_name = model_name
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
self._is_nvfp4 = False
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
|
|
@ -271,6 +272,9 @@ class ModelBase:
|
|||
return tensors
|
||||
|
||||
def dequant_model(self):
|
||||
if self._is_nvfp4:
|
||||
return # NVFP4 weights are repacked in _generate_nvfp4_tensors
|
||||
|
||||
tensors_to_remove: list[str] = []
|
||||
new_tensors: dict[str, Callable[[], Tensor]] = {}
|
||||
|
||||
|
|
@ -516,6 +520,13 @@ class ModelBase:
|
|||
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
|
||||
if self._is_nvfp4:
|
||||
if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
|
||||
return []
|
||||
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
|
||||
return []
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
# Handle gate/up expert tensor fusion if enabled
|
||||
|
|
@ -551,9 +562,135 @@ class ModelBase:
|
|||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
return ()
|
||||
|
||||
@staticmethod
|
||||
def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
|
||||
"""Repack NVFP4 ModelOpt tensors into ggml super-block layout.
|
||||
Preserves original E4M3 scale bits as UE4M3 (strip sign bit).
|
||||
The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul().
|
||||
Returns (raw_data, logical_shape)."""
|
||||
|
||||
out_features = weight.shape[0]
|
||||
n_blocks = scale.shape[1]
|
||||
|
||||
# Unpack ModelOpt nibble-packed weights
|
||||
w = weight.reshape(out_features, n_blocks, 8)
|
||||
vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16)
|
||||
|
||||
# Preserve original E4M3 scale bits as UE4M3 (strip sign bit)
|
||||
d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F
|
||||
qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy()
|
||||
|
||||
# Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements
|
||||
n_super = n_blocks // 4
|
||||
d_grouped = d_ue.reshape(out_features, n_super, 4)
|
||||
qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32)
|
||||
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
|
||||
return raw, [out_features, n_super * 64]
|
||||
|
||||
@staticmethod
|
||||
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
|
||||
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
|
||||
|
||||
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
|
||||
raw, shape = self._nvfp4_pack(weight, scale)
|
||||
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
|
||||
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||
|
||||
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
|
||||
if not self._nvfp4_scale2_is_trivial(scale2):
|
||||
scale2_f32 = scale2.float().numpy().flatten()
|
||||
scale_name = new_name.replace(".weight", ".scale")
|
||||
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale2_f32)
|
||||
|
||||
def _generate_nvfp4_tensors(self):
|
||||
# Per-layer expert merging to avoid holding all experts in memory
|
||||
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
|
||||
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_shapes: dict[tuple[int, str], list[int]] = {}
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
|
||||
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if not name.endswith(".weight"):
|
||||
continue
|
||||
scale_name = name.replace(".weight", ".weight_scale")
|
||||
scale2_name = name.replace(".weight", ".weight_scale_2")
|
||||
if scale_name not in self.model_tensors:
|
||||
continue
|
||||
# Force eager materialization of lazy tensors
|
||||
weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
|
||||
scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
|
||||
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
|
||||
|
||||
# Check if this is a per-expert tensor
|
||||
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
|
||||
if m:
|
||||
expert_id = int(m.group(1))
|
||||
proj_type = m.group(2)
|
||||
bid_m = re.search(r'\.layers\.(\d+)\.', name)
|
||||
bid = int(bid_m.group(1)) if bid_m else 0
|
||||
key = (bid, proj_type)
|
||||
|
||||
raw, shape = self._nvfp4_pack(weight, scale)
|
||||
|
||||
if key not in expert_blocks:
|
||||
expert_blocks[key] = []
|
||||
expert_scales[key] = []
|
||||
expert_shapes[key] = shape
|
||||
expert_blocks[key].append((expert_id, raw.copy()))
|
||||
# Collect per-expert scale2 (scalar per expert)
|
||||
expert_scales[key].append((expert_id, float(scale2.float().sum())))
|
||||
|
||||
# Flush when all experts for this (layer, proj) are collected
|
||||
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
|
||||
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
else:
|
||||
new_name = self.map_tensor_name(name)
|
||||
self._repack_nvfp4(new_name, weight, scale, scale2)
|
||||
|
||||
# Flush any remaining experts (fallback if n_experts was unknown)
|
||||
for (bid, proj_type) in list(expert_blocks.keys()):
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
|
||||
experts = expert_blocks.pop(key)
|
||||
scales = expert_scales.pop(key)
|
||||
shape = expert_shapes.pop(key)
|
||||
|
||||
experts.sort(key=lambda x: x[0])
|
||||
merged = np.stack([e[1] for e in experts], axis=0)
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight"
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
|
||||
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||
|
||||
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
|
||||
scales.sort(key=lambda x: x[0])
|
||||
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
|
||||
if not np.allclose(scale_vals, 1.0, atol=1e-6):
|
||||
scale_name = new_name.replace(".weight", ".scale")
|
||||
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale_vals)
|
||||
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
# detect NVFP4 quantization (ModelOpt format)
|
||||
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||
|
||||
if not quant_algo and quant_config_file.is_file():
|
||||
with open(quant_config_file, "r", encoding="utf-8") as f:
|
||||
quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
|
||||
|
||||
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||
|
||||
self.dequant_model()
|
||||
|
||||
# NVFP4 weights are repacked and written directly to gguf_writer
|
||||
if self._is_nvfp4:
|
||||
self._generate_nvfp4_tensors()
|
||||
|
||||
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
||||
if self.tensor_map.mapping:
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
|
|
@ -4303,6 +4440,14 @@ class Qwen2MoeModel(TextModel):
|
|||
# process the experts separately
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
|
||||
# NVFP4 expert weights are handled in _generate_nvfp4_tensors
|
||||
if self._is_nvfp4 and "experts" in name:
|
||||
if name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
|
||||
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
|
||||
return
|
||||
if not name.endswith(".weight"):
|
||||
return
|
||||
|
||||
# handle aggregated expert tensors
|
||||
# GGUF stores dimensions reversed from PyTorch, so:
|
||||
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
|
||||
|
|
|
|||
|
|
@ -427,7 +427,8 @@ extern "C" {
|
|||
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
||||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||
GGML_TYPE_COUNT = 40,
|
||||
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||
GGML_TYPE_COUNT = 41,
|
||||
};
|
||||
|
||||
// precision
|
||||
|
|
@ -463,6 +464,7 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
|
|
|||
|
|
@ -102,6 +102,9 @@ typedef sycl::half2 ggml_half2;
|
|||
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
|
||||
#define QR_MXFP4 2
|
||||
|
||||
#define QI_NVFP4 (QK_NVFP4 / (4 * QR_NVFP4))
|
||||
#define QR_NVFP4 2
|
||||
|
||||
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
||||
#define QR5_0 2
|
||||
|
||||
|
|
@ -194,6 +197,14 @@ typedef struct {
|
|||
} block_mxfp4;
|
||||
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
|
||||
|
||||
#define QK_NVFP4 64
|
||||
#define QK_NVFP4_SUB 16 // sub-block size for per-group scales
|
||||
typedef struct {
|
||||
uint8_t d[QK_NVFP4/QK_NVFP4_SUB]; // UE4M3 scales (4 bytes, one per 16-element sub-block)
|
||||
uint8_t qs[QK_NVFP4/2]; // packed 4-bit E2M1 values (32 bytes)
|
||||
} block_nvfp4;
|
||||
static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
|
||||
|
||||
#define QK5_0 32
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
|
|
@ -79,6 +80,8 @@
|
|||
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
|
|
@ -108,6 +111,7 @@
|
|||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
|
|
@ -155,6 +159,7 @@
|
|||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
@ -201,6 +206,7 @@
|
|||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
|
|
@ -240,6 +246,7 @@
|
|||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
|
|
@ -302,6 +309,7 @@
|
|||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
|
|||
|
|
@ -650,6 +650,90 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
assert(n % QK_NVFP4 == 0);
|
||||
|
||||
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
// Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
|
||||
const int nb = n / QK_NVFP4;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if defined __ARM_NEON
|
||||
const int8x16_t values = vld1q_s8(kvalues_mxfp4);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
float32x4_t acc = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
|
||||
const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
|
||||
|
||||
const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
|
||||
const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
|
||||
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
|
||||
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
|
||||
|
||||
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
|
||||
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
|
||||
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
|
||||
const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
|
||||
|
||||
const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
|
||||
const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
|
||||
const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
|
||||
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
|
||||
|
||||
const int32x4_t p0 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
|
||||
const int32x4_t p1 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
|
||||
|
||||
const int32x4_t sums = vpaddq_s32(p0, p1);
|
||||
|
||||
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
|
||||
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
|
||||
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
|
||||
const float32x4_t nvsc = {
|
||||
ggml_ue4m3_to_fp32(x[ib].d[0]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[1]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[2]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[3])
|
||||
};
|
||||
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
|
||||
}
|
||||
sumf = vaddvq_f32(acc);
|
||||
#else
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
for (int si = 0; si < 4; ++si) {
|
||||
const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
|
||||
const int q8b = si / 2;
|
||||
const int q8o = (si % 2) * QK_NVFP4_SUB;
|
||||
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
|
||||
|
||||
int sumi_lo = 0, sumi_hi = 0;
|
||||
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
|
||||
const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
|
||||
sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
|
||||
sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
|
||||
}
|
||||
sumf += dy * d * (sumi_lo + sumi_hi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
|
|
@ -270,6 +270,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_NVFP4] = {
|
||||
.from_float = quantize_row_nvfp4,
|
||||
.vec_dot = ggml_vec_dot_nvfp4_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.from_float = quantize_row_q2_K,
|
||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||
|
|
|
|||
|
|
@ -670,6 +670,7 @@ void ggml_compute_forward_add(
|
|||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -1119,6 +1120,7 @@ void ggml_compute_forward_add1(
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -1247,6 +1249,7 @@ void ggml_compute_forward_acc(
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -4334,6 +4337,7 @@ void ggml_compute_forward_out_prod(
|
|||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -4609,6 +4613,7 @@ void ggml_compute_forward_set(
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -4831,6 +4836,7 @@ void ggml_compute_forward_get_rows(
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
@ -5555,6 +5561,7 @@ void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
|
|
|||
|
|
@ -50,6 +50,10 @@ void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i
|
|||
quantize_row_mxfp4_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||
quantize_row_nvfp4_ref(x, y, k);
|
||||
}
|
||||
|
||||
//
|
||||
// 2-6 bit quantization in super-blocks
|
||||
//
|
||||
|
|
@ -216,6 +220,42 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
assert(n % QK_NVFP4 == 0);
|
||||
|
||||
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
const int nb = n / QK_NVFP4;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
for (int s_idx = 0; s_idx < 4; ++s_idx) {
|
||||
const float d = ggml_ue4m3_to_fp32(x[ib].d[s_idx]);
|
||||
const int q8_block = s_idx / 2;
|
||||
const int q8_off = (s_idx % 2) * QK_NVFP4_SUB;
|
||||
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);
|
||||
|
||||
int sumi_lo = 0, sumi_hi = 0;
|
||||
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
|
||||
const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
|
||||
sumi_lo += y[2*ib + q8_block].qs[q8_off + j + 0] * kvalues_mxfp4[qv & 0xf];
|
||||
sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
|
||||
}
|
||||
|
||||
sumf += dy * d * (sumi_lo + sumi_hi);
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -42,6 +43,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
|
@ -73,6 +75,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
|
|
|||
|
|
@ -491,6 +491,61 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
|
|||
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
|
||||
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
|
||||
|
||||
// UE4M3: unsigned, 4 exp bits (bias=7), 3 mantissa bits
|
||||
// Returns value * 0.5 to match kvalues_mxfp4 convention (kvalues = 2 * E2M1_float)
|
||||
static inline float ggml_ue4m3_to_fp32(uint8_t x) {
|
||||
if (x == 0 || x == 0x7F) {
|
||||
return 0.0f;
|
||||
}
|
||||
int exp = (x >> 3) & 0xF;
|
||||
int man = x & 0x7;
|
||||
float raw;
|
||||
if (exp == 0) {
|
||||
raw = ldexpf((float) man, -9);
|
||||
} else {
|
||||
raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7);
|
||||
}
|
||||
return raw * 0.5f;
|
||||
}
|
||||
|
||||
static inline uint8_t ggml_fp32_to_ue4m3(float x) {
|
||||
if (!(x > 0.0f)) {
|
||||
return 0;
|
||||
}
|
||||
if (x > 448.0f) {
|
||||
x = 448.0f;
|
||||
}
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, 4);
|
||||
int fp32_exp = ((bits >> 23) & 0xFF) - 127;
|
||||
int fp32_man = (bits >> 20) & 0x7;
|
||||
int ue4m3_exp = fp32_exp + 7;
|
||||
if (ue4m3_exp <= 0) {
|
||||
// subnormal: value = man * 2^-9, man = round(x * 2^9)
|
||||
int man = (int) (x * 512.0f + 0.5f);
|
||||
if (man > 7) {
|
||||
man = 7;
|
||||
}
|
||||
if (man < 1) {
|
||||
return 0;
|
||||
}
|
||||
return (uint8_t) man;
|
||||
}
|
||||
if (ue4m3_exp >= 15) {
|
||||
return 0x7E;
|
||||
}
|
||||
int round_bit = (bits >> 19) & 1;
|
||||
int ue4m3_man = fp32_man + round_bit;
|
||||
if (ue4m3_man > 7) {
|
||||
ue4m3_man = 0;
|
||||
ue4m3_exp++;
|
||||
if (ue4m3_exp >= 15) {
|
||||
return 0x7E;
|
||||
}
|
||||
}
|
||||
return (uint8_t) ((ue4m3_exp << 3) | ue4m3_man);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1158,7 +1158,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
case GGML_OP_SOLVE_TRI:
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return has_simdgroup_reduction;
|
||||
return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
|
|
@ -1216,7 +1216,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
};
|
||||
}
|
||||
case GGML_OP_GET_ROWS:
|
||||
return true;
|
||||
return op->src[0]->type != GGML_TYPE_NVFP4;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
|
|
|
|||
|
|
@ -304,6 +304,41 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK_NVFP4;
|
||||
static const int qk_sub = QK_NVFP4_SUB;
|
||||
static const int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int s = 0; s < n_sub; s++) {
|
||||
const float * xb = x + i*qk + s*qk_sub;
|
||||
|
||||
float amax = 0.0f;
|
||||
for (int j = 0; j < qk_sub; j++) {
|
||||
if (amax < fabsf(xb[j])) {
|
||||
amax = fabsf(xb[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// UE4M3 scale: amax / 6.0 maps the max E2M1 value (6.0) to amax
|
||||
const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f);
|
||||
y[i].d[s] = ue;
|
||||
const float d = ggml_ue4m3_to_fp32(ue);
|
||||
|
||||
for (int j = 0; j < qk_sub/2; ++j) {
|
||||
const uint8_t x0 = best_index_mxfp4(xb[0 + j], d);
|
||||
const uint8_t x1 = best_index_mxfp4(xb[qk_sub/2 + j], d);
|
||||
|
||||
y[i].qs[s*(qk_sub/2) + j] = x0 | (x1 << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
|
|
@ -434,6 +469,31 @@ void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_REST
|
|||
}
|
||||
}
|
||||
|
||||
void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
static const int qk = QK_NVFP4;
|
||||
static const int qk_sub = QK_NVFP4_SUB;
|
||||
static const int n_sub = QK_NVFP4 / QK_NVFP4_SUB;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int s = 0; s < n_sub; s++) {
|
||||
const float d = ggml_ue4m3_to_fp32(x[i].d[s]);
|
||||
float * yb = y + i*qk + s*qk_sub;
|
||||
|
||||
for (int j = 0; j < qk_sub/2; ++j) {
|
||||
const int8_t v0 = kvalues_mxfp4[x[i].qs[s*(qk_sub/2) + j] & 0x0F];
|
||||
const int8_t v1 = kvalues_mxfp4[x[i].qs[s*(qk_sub/2) + j] >> 4];
|
||||
|
||||
yb[j + 0 ] = v0*d;
|
||||
yb[j + qk_sub/2] = v1*d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// 2-6 bit quantization in super-blocks
|
||||
//
|
||||
|
|
@ -2098,6 +2158,12 @@ size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
|||
return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
|
||||
}
|
||||
|
||||
size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
GGML_UNUSED(quant_weights);
|
||||
quantize_row_nvfp4_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_NVFP4, n_per_row);
|
||||
}
|
||||
|
||||
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
||||
|
||||
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
|
||||
|
|
@ -5244,6 +5310,12 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
|||
{
|
||||
VALIDATE_ROW_DATA_E_E8M0_IMPL(block_mxfp4, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
{
|
||||
// UE4M3 scales are uint8_t — all byte values are valid
|
||||
GGML_UNUSED(data);
|
||||
GGML_UNUSED(nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 *
|
|||
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -48,6 +49,7 @@ GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GG
|
|||
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -95,6 +97,7 @@ GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTR
|
|||
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
||||
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
||||
|
|
|
|||
|
|
@ -718,6 +718,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|||
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
|
||||
},
|
||||
[GGML_TYPE_NVFP4] = {
|
||||
.type_name = "nvfp4",
|
||||
.blck_size = QK_NVFP4,
|
||||
.type_size = sizeof(block_nvfp4),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_nvfp4,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_nvfp4_ref,
|
||||
},
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.type_name = "q2_K",
|
||||
.blck_size = QK_K,
|
||||
|
|
@ -1374,6 +1382,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||
case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
|
||||
case GGML_FTYPE_MOSTLY_NVFP4: wtype = GGML_TYPE_NVFP4; break;
|
||||
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
|
||||
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
||||
|
|
@ -7641,6 +7650,7 @@ size_t ggml_quantize_chunk(
|
|||
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
|
|
|
|||
|
|
@ -3784,6 +3784,7 @@ class GGMLQuantizationType(IntEnum):
|
|||
TQ1_0 = 34
|
||||
TQ2_0 = 35
|
||||
MXFP4 = 39
|
||||
NVFP4 = 40
|
||||
|
||||
|
||||
class ExpertGatingFuncType(IntEnum):
|
||||
|
|
@ -3941,6 +3942,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
|||
GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
|
||||
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
|
||||
GGMLQuantizationType.MXFP4: (32, 1 + 16),
|
||||
GGMLQuantizationType.NVFP4: (64, 4 + 32),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -139,10 +139,13 @@ class GGUFWriter:
|
|||
size = prod(shape)
|
||||
|
||||
if "_exps." in name:
|
||||
expert_count = shape[-2 if ".bias" in name else -3]
|
||||
expert_params += (size // expert_count)
|
||||
expert_sum += expert_count
|
||||
n_expert_tensors += 1
|
||||
if len(shape) >= 3:
|
||||
expert_count = shape[-2 if ".bias" in name else -3]
|
||||
expert_params += (size // expert_count)
|
||||
expert_sum += expert_count
|
||||
n_expert_tensors += 1
|
||||
else:
|
||||
shared_params += size
|
||||
else:
|
||||
shared_params += size
|
||||
|
||||
|
|
|
|||
|
|
@ -704,6 +704,65 @@ class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
|
|||
return (d * qs.astype(np.float32))
|
||||
|
||||
|
||||
class NVFP4(__Quant, qtype=GGMLQuantizationType.NVFP4):
|
||||
# E2M1 values doubled (kvalues_mxfp4 convention)
|
||||
kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
|
||||
|
||||
@staticmethod
|
||||
def ue4m3_to_fp32(x: np.ndarray) -> np.ndarray:
|
||||
"""Decode unsigned E4M3 (bias=7) to float, with 0.5 factor for kvalues convention."""
|
||||
exp = (x >> 3).astype(np.int32) & 0xF
|
||||
man = (x & 0x7).astype(np.float32)
|
||||
raw = np.where(
|
||||
exp == 0,
|
||||
man * 2**-9,
|
||||
(1.0 + man / 8.0) * (2.0 ** (exp.astype(np.float32) - 7)))
|
||||
return np.where((x == 0) | (x == 0x7F), 0.0, raw * 0.5)
|
||||
|
||||
@staticmethod
|
||||
def fp32_to_ue4m3(x: np.ndarray) -> np.ndarray:
|
||||
"""Vectorized float32 to unsigned E4M3, matching ggml_fp32_to_ue4m3 in C."""
|
||||
x = np.clip(x, 0.0, 448.0).astype(np.float32)
|
||||
bits = x.view(np.uint32)
|
||||
fp32_exp = ((bits >> 23) & 0xFF).astype(np.int32) - 127
|
||||
fp32_man = ((bits >> 20) & 0x7).astype(np.int32)
|
||||
ue4m3_exp = fp32_exp + 7
|
||||
|
||||
# Subnormal
|
||||
sub_man = np.clip((x * 512.0 + 0.5).astype(np.int32), 0, 7)
|
||||
sub_result = np.where(sub_man >= 1, sub_man, 0).astype(np.uint8)
|
||||
|
||||
# Normal with rounding
|
||||
round_bit = ((bits >> 19) & 1).astype(np.int32)
|
||||
man = fp32_man + round_bit
|
||||
exp = ue4m3_exp.copy()
|
||||
overflow = man > 7
|
||||
man = np.where(overflow, 0, man)
|
||||
exp = np.where(overflow, exp + 1, exp)
|
||||
normal_result = np.where(exp >= 15, np.uint8(0x7E), ((exp << 3) | man).astype(np.uint8))
|
||||
|
||||
return np.where(x <= 0.0, np.uint8(0),
|
||||
np.where(ue4m3_exp <= 0, sub_result,
|
||||
np.where(ue4m3_exp >= 15, np.uint8(0x7E), normal_result)))
|
||||
|
||||
@classmethod
|
||||
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
n_super = blocks.shape[0]
|
||||
|
||||
d_bytes, qs = np.hsplit(blocks, [4])
|
||||
d = cls.ue4m3_to_fp32(d_bytes).reshape(n_super, 4, 1) # (n_super, 4, 1)
|
||||
|
||||
qs = qs.reshape(n_super, 4, 8)
|
||||
lo = (qs & np.uint8(0x0F)).view(np.int8)
|
||||
hi = (qs >> np.uint8(4)).view(np.int8)
|
||||
vals = np.concatenate([lo, hi], axis=-1) # (n_super, 4, 16)
|
||||
|
||||
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
|
||||
vals = np.take_along_axis(kvalues, vals, axis=-1)
|
||||
|
||||
return (d * vals.astype(np.float32)).reshape(n_super, 64)
|
||||
|
||||
|
||||
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
|
||||
ksigns: bytes = (
|
||||
b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
|
||||
|
|
|
|||
|
|
@ -65,6 +65,7 @@ byteswap_tensors = {
|
|||
gguf.GGMLQuantizationType.Q4_K: byteswap_q4_k,
|
||||
gguf.GGMLQuantizationType.Q6_K: byteswap_q6_k,
|
||||
gguf.GGMLQuantizationType.MXFP4: byteswap_noop,
|
||||
gguf.GGMLQuantizationType.NVFP4: byteswap_noop,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ class GGMLQuants:
|
|||
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
|
||||
"tq1_0", "tq2_0",
|
||||
"mxfp4",
|
||||
"nvfp4",
|
||||
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
|
||||
"iq4_nl", "iq4_xs",
|
||||
):
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1166,7 +1166,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps) const {
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
return build_moe_ffn(
|
||||
cur,
|
||||
gate_inp, /* gate_inp_b */ nullptr,
|
||||
|
|
@ -1182,7 +1185,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
gating_op,
|
||||
il,
|
||||
probs_in,
|
||||
gate_up_exps
|
||||
gate_up_exps,
|
||||
/* gate_up_exps_b */ nullptr,
|
||||
up_exps_s,
|
||||
gate_exps_s,
|
||||
down_exps_s
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1206,7 +1213,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * gate_up_exps_b) const {
|
||||
ggml_tensor * gate_up_exps_b,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
|
||||
|
|
@ -1358,6 +1368,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(gate_up, "ffn_moe_gate_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
gate_up = ggml_mul(ctx0, gate_up, s);
|
||||
cb(gate_up, "ffn_moe_gate_up_scaled", il);
|
||||
}
|
||||
|
||||
const int64_t n_ff = gate_up->ne[0] / 2;
|
||||
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1373,6 +1392,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(up, "ffn_moe_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to up
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
up = ggml_mul(ctx0, up, s);
|
||||
cb(up, "ffn_moe_up_scaled", il);
|
||||
}
|
||||
|
||||
if (gate_exps) {
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1384,6 +1412,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
|
||||
cb(cur, "ffn_moe_gate_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to gate
|
||||
if (gate_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
cur = ggml_mul(ctx0, cur, s);
|
||||
cb(cur, "ffn_moe_gate_scaled", il);
|
||||
}
|
||||
}
|
||||
|
||||
const bool has_gate = gate_exps || gate_up_exps;
|
||||
|
|
@ -1463,6 +1500,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(experts, "ffn_moe_down_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to down
|
||||
if (down_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
experts = ggml_mul(ctx0, experts, s);
|
||||
cb(experts, "ffn_moe_down_scaled", il);
|
||||
}
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
|
|
|
|||
|
|
@ -814,7 +814,10 @@ struct llm_graph_context {
|
|||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in = nullptr,
|
||||
ggml_tensor * gate_up_exps = nullptr) const;
|
||||
ggml_tensor * gate_up_exps = nullptr,
|
||||
ggml_tensor * up_exps_s = nullptr,
|
||||
ggml_tensor * gate_exps_s = nullptr,
|
||||
ggml_tensor * down_exps_s = nullptr) const;
|
||||
|
||||
ggml_tensor * build_moe_ffn(
|
||||
ggml_tensor * cur,
|
||||
|
|
@ -836,7 +839,10 @@ struct llm_graph_context {
|
|||
int il,
|
||||
ggml_tensor * probs_in = nullptr,
|
||||
ggml_tensor * gate_up_exps = nullptr,
|
||||
ggml_tensor * gate_up_exps_b = nullptr) const;
|
||||
ggml_tensor * gate_up_exps_b = nullptr,
|
||||
ggml_tensor * up_exps_s = nullptr,
|
||||
ggml_tensor * gate_exps_s = nullptr,
|
||||
ggml_tensor * down_exps_s = nullptr) const;
|
||||
|
||||
//
|
||||
// inputs
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
|
||||
case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
|
|
@ -724,6 +725,7 @@ llama_model_loader::llama_model_loader(
|
|||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
|
|
|||
|
|
@ -5010,23 +5010,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_T5:
|
||||
|
|
@ -7443,6 +7443,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
||||
// generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
|
||||
// this avoids having to add scale loading to every architecture
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
// attention weight scales (per-tensor, shape {1})
|
||||
if (!layer.wq_s && layer.wq) {
|
||||
layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wk_s && layer.wk) {
|
||||
layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wv_s && layer.wv) {
|
||||
layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wo_s && layer.wo) {
|
||||
layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// dense FFN weight scales (per-tensor, shape {1})
|
||||
if (!layer.ffn_gate_s && layer.ffn_gate) {
|
||||
layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_s && layer.ffn_down) {
|
||||
layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_s && layer.ffn_up) {
|
||||
layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// MoE expert weight scales (per-expert, shape {n_expert})
|
||||
if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
|
||||
layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
|
||||
layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
|
||||
layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
|
|
|||
|
|
@ -295,6 +295,11 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_up_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
|
||||
|
||||
// ff MoE per-expert scales (NVFP4 per-tensor scale2)
|
||||
struct ggml_tensor * ffn_gate_exps_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_s = nullptr;
|
||||
|
||||
// ff MoE latent proj
|
||||
struct ggml_tensor * ffn_latent_down = nullptr;
|
||||
struct ggml_tensor * ffn_latent_up = nullptr;
|
||||
|
|
@ -392,13 +397,13 @@ struct llama_layer {
|
|||
struct ggml_tensor * rope_freqs = nullptr;
|
||||
|
||||
// bitnet scale
|
||||
struct ggml_tensor * wq_scale = nullptr;
|
||||
struct ggml_tensor * wk_scale = nullptr;
|
||||
struct ggml_tensor * wv_scale = nullptr;
|
||||
struct ggml_tensor * wo_scale = nullptr;
|
||||
struct ggml_tensor * ffn_gate_scale = nullptr;
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
struct ggml_tensor * wq_s = nullptr;
|
||||
struct ggml_tensor * wk_s = nullptr;
|
||||
struct ggml_tensor * wv_s = nullptr;
|
||||
struct ggml_tensor * wo_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_s = nullptr;
|
||||
|
||||
// altup & laurel
|
||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||
|
|
|
|||
|
|
@ -30,8 +30,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_scale) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
||||
if (model.layers[il].wq_s) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
|
||||
}
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
|
|
@ -41,8 +41,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
|
||||
// B1.K
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_scale) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
||||
if (model.layers[il].wk_s) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
|
||||
}
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
|
|
@ -52,8 +52,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
|
||||
// B1.V
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_scale) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
||||
if (model.layers[il].wv_s) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
|
||||
}
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
|
|
@ -91,8 +91,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "attn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
if (model.layers[il].wo_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
}
|
||||
if (model.layers[il].bo) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
|
|
@ -115,8 +115,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
||||
NULL, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
|
|
@ -128,8 +128,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "ffn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
||||
if (model.layers[il].ffn_down_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
|
||||
if (model.layers[il].ffn_down_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_s);
|
||||
}
|
||||
cb(cur, "ffn_down", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -44,18 +44,27 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_s) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
|
||||
}
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_s) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
|
||||
}
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_s) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
|
||||
}
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
|
|
@ -91,6 +100,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
}
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
@ -109,9 +121,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
|
@ -132,7 +144,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
il,
|
||||
nullptr, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
|
|
|||
|
|
@ -31,12 +31,21 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
|
|||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_s) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
|
||||
}
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_s) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
|
||||
}
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_s) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
|
||||
}
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
|
@ -68,6 +77,9 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
|
|||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
}
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
|
@ -83,9 +95,9 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
||||
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
|
|
|||
|
|
@ -31,12 +31,21 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
|
|||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_s) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
|
||||
}
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_s) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
|
||||
}
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_s) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
|
||||
}
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
|
@ -68,6 +77,9 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
|
|||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
}
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
|
@ -93,7 +105,11 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
|
|||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
il,
|
||||
nullptr, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
cur = moe_out;
|
||||
|
||||
|
|
|
|||
|
|
@ -7854,10 +7854,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128, 1024}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 3, 4, {128, 1024}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128*1024, 1}, {1, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 2, 1, 3, {128*1024, 1}, {1, 1}, {0, 1, 2, 3}, 64));
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 576, 512, 576, {1,1}, {1,1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 1, 2048, 8192, {1, 1}, {1, 1}));
|
||||
|
|
|
|||
|
|
@ -20,8 +20,10 @@ constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
|
|||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_FP4 = 0.0030f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_FP4 = 0.03f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
|
||||
|
||||
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
|
|
@ -149,7 +151,8 @@ int main(int argc, char * argv[]) {
|
|||
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
|
||||
type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
|
|
@ -169,6 +172,8 @@ int main(int argc, char * argv[]) {
|
|||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
||||
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
||||
: type == GGML_TYPE_NVFP4
|
||||
? MAX_DOT_PRODUCT_ERROR_FP4
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
|
|
|
|||
Loading…
Reference in New Issue