convert : support non-mxfp4 HF model (#15153)
* convert : support non-mxfp4 HF model
* rm redundant check
* disable debug check
This commit is contained in:
parent
c4f53563df
commit
50aa938901
@@ -8102,7 +8102,6 @@ class GptOssModel(TextModel):
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         blocks0: Tensor = torch.zeros(1)
         blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
         # we assume that tensors are loaded in the correct order
         for name, data_torch in self.get_tensors():
             if "mlp.experts.down_proj_blocks" in name:
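Note that the loop above relies on the HF loader yielding each *_blocks tensor before its matching *_scales tensor, stashing the blocks until the scales arrive. A minimal sketch of that stash-then-pair pattern, with made-up tensor shapes and a stub in place of the real repack_mxfp4:

import torch

def repack_stub(name: str, blocks: torch.Tensor, scales: torch.Tensor) -> None:
    # stand-in for GptOssModel.repack_mxfp4, which packs the block and
    # scale tensors into a single ggml MXFP4 tensor
    print(f"repacked {name}: {tuple(blocks.shape)} + {tuple(scales.shape)}")

blocks0 = torch.zeros(1)  # placeholder until the real blocks arrive
stream = [
    ("mlp.experts.down_proj_blocks", torch.zeros(2, 8, 16)),
    ("mlp.experts.down_proj_scales", torch.zeros(2, 8, 1)),
]
for name, data in stream:  # order matters: blocks first, then scales
    if name.endswith("_blocks"):
        blocks0 = data
    elif name.endswith("_scales"):
        repack_stub(name.replace("_scales", ".weight"), blocks0, data)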
@@ -8110,7 +8109,6 @@ class GptOssModel(TextModel):
             elif "mlp.experts.down_proj_scales" in name:
                 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
                 self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
             elif "mlp.experts.gate_up_proj_blocks" in name:
                 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
             elif "mlp.experts.gate_up_proj_scales" in name:
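The gate_up_proj_blocks split above works because gate and up rows are interleaved along dim 1 of the packed tensor: even rows belong to gate, odd rows to up. A toy illustration, with shapes invented for the example:

import torch

# pretend packed blocks: (n_expert=2, 2*n_rows=6, n_blocks=3, block_bytes=16)
gate_up = torch.arange(2 * 6 * 3 * 16).reshape(2, 6, 3, 16)
blocks0, blocks1 = gate_up[:, ::2, :, :], gate_up[:, 1::2, :, :]  # even/odd rows
assert blocks0.shape == blocks1.shape == (2, 3, 3, 16)
assert torch.equal(blocks0[0, 0], gate_up[0, 0])  # row 0 -> gate
assert torch.equal(blocks1[0, 0], gate_up[0, 1])  # row 1 -> up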
@@ -8119,9 +8117,6 @@ class GptOssModel(TextModel):
                 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
                 self.repack_mxfp4(new_name_gate, blocks0, scales0)
                 self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
         return []

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8134,7 +8129,12 @@ class GptOssModel(TextModel):
         if "down_proj" in name:
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name = name.replace("down_proj", "down_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         # split the gate_up into gate and up
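The new elif branch covers unquantized HF checkpoints, where down_proj arrives as a plain weight matrix rather than MXFP4 blocks and scales. Assuming a per-expert HF layout of (n_expert, n_ff, n_embd), transpose(-1, -2) swaps only the last two dims to match what the converter expects; a quick sanity sketch under those assumed shapes:

import torch

n_expert, n_ff, n_embd = 2, 8, 4    # assumed toy dimensions
w = torch.randn(n_expert, n_ff, n_embd)
w_t = w.transpose(-1, -2)           # (n_expert, n_embd, n_ff)
assert w_t.shape == (n_expert, n_embd, n_ff)
assert torch.equal(w_t[0], w[0].T)  # each expert matrix is transposed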
@@ -8147,7 +8147,18 @@ class GptOssModel(TextModel):
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
+                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_weight),
+                    (self.map_tensor_name(name_up), up_proj_weight)
+                ]
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         return [(self.map_tensor_name(name), data_torch)]
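For an unquantized gate_up_proj the same two ideas combine: transpose the last two dims, then peel apart the interleaved gate/up rows. A self-contained sketch under an assumed HF shape of (n_expert, n_embd, 2*n_ff):

import torch

n_expert, n_embd, n_ff = 2, 4, 3                # assumed toy dimensions
gate_up = torch.randn(n_expert, n_embd, 2 * n_ff)
t = gate_up.transpose(-1, -2)                   # (n_expert, 2*n_ff, n_embd)
gate_w, up_w = t[:, ::2, :], t[:, 1::2, :]      # even rows gate, odd rows up
assert gate_w.shape == up_w.shape == (n_expert, n_ff, n_embd)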
@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                 // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 1
+#if 0
                 if (new_type == GGML_TYPE_MXFP4) {
                     auto * x = f32_data_03;
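The #if 1 -> #if 0 flip compiles out a temporary check that quantizing to MXFP4 and dequantizing reproduces the input exactly; that only holds when the source weights were MXFP4 to begin with, so it would fire on the newly supported non-MXFP4 models. A toy sketch of the idea behind such a round-trip check (this is not the real ggml MXFP4 codec, just a power-of-two-scale stand-in):

import numpy as np

def toy_quant_dequant(x: np.ndarray) -> np.ndarray:
    # one shared power-of-two scale per 32-element block, as in MXFP4
    blocks = x.reshape(-1, 32)
    scale = 2.0 ** np.ceil(np.log2(np.abs(blocks).max(axis=1, keepdims=True) + 1e-30))
    q = np.round(blocks / scale * 8) / 8  # crude stand-in for the 4-bit grid
    return (q * scale).reshape(x.shape)

x = np.random.randn(64).astype(np.float32)
y = toy_quant_dequant(x)
# weights that started life as MXFP4 round-trip exactly; arbitrary
# weights do not, which is why the check is now compiled out
print(np.max(np.abs(x - y)))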