add min/max pixels gguf metadata
parent 8bbeab0616 · commit 89db71702b
@@ -7667,8 +7667,7 @@ class DeepseekV2Model(TextModel):
         raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("VaetkiForCausalLM")
-@ModelBase.register("VaetkiVLForCausalLM")
+@ModelBase.register("VaetkiForCausalLM", "VaetkiVLForCausalLM")
 class VaetkiModel(TextModel):
     """VAETKI MoE model with MLA attention and 4-norm layer structure"""
     model_arch = gguf.MODEL_ARCH.VAETKI
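For reference, the combined decorator registers one converter class under both the text-only and the VL architecture names. A minimal sketch of how a multi-name registry decorator of this shape can work (hypothetical standalone code, not the converter's actual implementation):

```python
_model_classes: dict[str, type] = {}

def register(*names: str):
    """Map every given architecture name to the decorated class."""
    def wrapper(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls
        return cls
    return wrapper

@register("VaetkiForCausalLM", "VaetkiVLForCausalLM")
class VaetkiModel:
    pass

# both names resolve to the same converter class
assert _model_classes["VaetkiVLForCausalLM"] is VaetkiModel
```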
@@ -7922,7 +7921,7 @@ class VaetkiVisionModel(MmprojModel):
         if "embed_dim" in self.hparams_vision:
             self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
         if "image_size" not in self.hparams_vision:
-            self.hparams_vision["image_size"] = self.preprocessor_config.get("size", {}).get("shortest_edge", 560)
+            self.hparams_vision["image_size"] = 560 # unused, set for compatibility
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -7934,6 +7933,11 @@ class VaetkiVisionModel(MmprojModel):
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-5))
         self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
 
+        # support dynamic size
+        image_size = self.preprocessor_config["size"]
+        self.gguf_writer.add_vision_image_min_pixels(image_size["shortest_edge"])
+        self.gguf_writer.add_vision_image_max_pixels(image_size["longest_edge"])
+
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if "class_pos_embd" in new_name:
             return gguf.GGMLQuantizationType.F32
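The new block reads the `size` entry of the Hugging Face preprocessor config; for dynamic-resolution encoders, `shortest_edge`/`longest_edge` there hold total pixel-count bounds rather than edge lengths, which is why they map onto min/max pixels. A minimal sketch of the shape the code expects (all numbers illustrative, not taken from this model):

```python
# Illustrative preprocessor_config.json contents for a dynamic-size model.
preprocessor_config = {
    "size": {
        "shortest_edge": 4 * 28 * 28,     # minimum total pixels per image
        "longest_edge": 4096 * 28 * 28,   # maximum total pixels per image
    },
}

size = preprocessor_config["size"]
# these values become clip.vision.image_min_pixels / image_max_pixels
print(size["shortest_edge"], size["longest_edge"])
```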
@@ -284,6 +284,8 @@ class Keys:
     class ClipVision:
         PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models
         IMAGE_SIZE = "clip.vision.image_size"
+        IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
+        IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
         PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
         PATCH_SIZE = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -1113,6 +1113,12 @@ class GGUFWriter:
     def add_vision_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
 
+    def add_vision_image_max_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)
+
+    def add_vision_image_min_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
+
     def add_vision_preproc_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
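One way to sanity-check a converted file is to read the new keys back with gguf-py's `GGUFReader`. A minimal sketch (the file path is hypothetical; scalar-field access follows the reader's parts/data layout):

```python
from gguf import GGUFReader

reader = GGUFReader("mmproj-model.gguf")  # hypothetical path

for key in ("clip.vision.image_min_pixels", "clip.vision.image_max_pixels"):
    field = reader.get_field(key)
    if field is not None:
        # a scalar field stores its value at parts[data[0]]
        print(key, int(field.parts[field.data[0]][0]))
```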
@@ -36,6 +36,8 @@
 // vision-specific
 #define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS    "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS    "clip.vision.image_max_pixels"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
@@ -1201,7 +1201,8 @@ struct clip_model_loader {
                     hparams.rope_theta = 10000.0f;
                     hparams.n_merge = 2;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                    hparams.set_limit_image_tokens(4, 3265);
+                    get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                    get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                     hparams.set_warmup_n_tokens(40*40);
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
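On the C++ side, these two values replace the previously hardcoded token limit, so the image-size bounds now travel with the GGUF file. For intuition, this is the kind of aspect-preserving clamp such pixel bounds enable; a minimal Python sketch assuming a Qwen-style resize where each side snaps to a patch-aligned multiple (the `factor` of 28 and all numbers are illustrative, not taken from this diff):

```python
import math

def clamp_resolution(w: int, h: int, min_pixels: int, max_pixels: int,
                     factor: int = 28) -> tuple[int, int]:
    """Scale (w, h) so w*h lands inside [min_pixels, max_pixels],
    keeping the aspect ratio and snapping each side to `factor`."""
    if w * h > max_pixels:
        # shrink, flooring so the result stays under the cap
        scale = math.sqrt(max_pixels / (w * h))
        w = math.floor(w * scale / factor) * factor
        h = math.floor(h * scale / factor) * factor
    elif w * h < min_pixels:
        # grow, ceiling so the result stays above the floor
        scale = math.sqrt(min_pixels / (w * h))
        w = math.ceil(w * scale / factor) * factor
        h = math.ceil(h * scale / factor) * factor
    else:
        w = round(w / factor) * factor
        h = round(h / factor) * factor
    return max(factor, w), max(factor, h)

# a 12 MP photo squeezed into illustrative bounds
print(clamp_resolution(4032, 3024, 4 * 28 * 28, 1960 * 1960))
```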