From eac9c6ea83ff2b72ba3b5459a58c44990823f2cf Mon Sep 17 00:00:00 2001 From: Michael Wand Date: Sat, 21 Mar 2026 04:35:21 -0700 Subject: [PATCH] Convert: Make NVFP4 and MXFP4 HF conversions say NVFP4/MXFP4 instead of BF16 (#20730) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Corrected convert script for NVFP4 naming and updated gguf constants * Add mostly_MXFP4 to FileType Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * simplify * set initial value [no ci] --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 12 ++++++++++-- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 087e9f926f..dba190b480 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -145,6 +145,7 @@ class ModelBase: self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self._is_nvfp4 = False + self._is_mxfp4 = False # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. @@ -712,6 +713,7 @@ class ModelBase: def prepare_tensors(self): # detect NVFP4 quantization (ModelOpt format) quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_config_file = self.dir_model / "hf_quant_config.json" @@ -728,6 +730,7 @@ class ModelBase: quant_algo = "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" # NVFP4 weights are repacked and written directly to gguf_writer. # This must run before dequant_model so NVFP4 tensors are removed @@ -876,6 +879,12 @@ class ModelBase: if self.metadata.name is None: self.metadata.name = self.dir_model.name + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) @@ -11125,8 +11134,7 @@ class GptOssModel(TextModel): # TODO: remove once MXFP4 is supported more generally def dequant_model(self): - quant_config = self.hparams.get("quantization_config") - if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + if self._is_mxfp4: return return super().dequant_model() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0a032e9039..c5f92c7700 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum): # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_MXFP4_MOE = 38 # except 1d tensors + MOSTLY_NVFP4 = 39 # except 1d tensors GUESSED = 1024 # not specified in the model file