Merge 16010cba64 into 39bf692af1

commit 31e8db2a52

@@ -160,8 +160,6 @@ class ModelBase:
             self.ftype = gguf.LlamaFileType.MOSTLY_F16
             logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")

-        self.dequant_model()
-
         # Configure GGUF Writer
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

@@ -527,6 +525,8 @@ class ModelBase:
         return ()

     def prepare_tensors(self):
+        self.dequant_model()
+
         # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
         if self.tensor_map.mapping:
             max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

@@ -1812,7 +1812,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]

     has_vision_encoder: bool = True # by default
     has_audio_encoder: bool = False

@@ -1867,7 +1867,15 @@ class MmprojModel(ModelBase):
         preprocessor_config_path = self.dir_model / "preprocessor_config.json"
         if preprocessor_config_path.is_file():
             with open(preprocessor_config_path, "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
+                cfg = json.load(f)
+                # move media_proc_cfg to root level for compat
+                if "media_proc_cfg" in cfg:
+                    cfg = {
+                        **cfg,
+                        **cfg["media_proc_cfg"],
+                    }
+                # merge configs
+                self.preprocessor_config = {**self.preprocessor_config, **cfg}

         # prefer processor_config.json if possible
         processor_config_path = self.dir_model / "processor_config.json"

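Note on the preprocessor_config change above: any keys nested under media_proc_cfg are lifted to the top level, and the result is merged over whatever was already in self.preprocessor_config (later keys win). A minimal sketch of that dict handling, with a made-up nested key purely for illustration:

    existing = {"image_mean": [0.5, 0.5, 0.5]}
    cfg = {"media_proc_cfg": {"in_patch_limit": 16384}}

    if "media_proc_cfg" in cfg:
        cfg = {**cfg, **cfg["media_proc_cfg"]}   # lift nested keys to the root

    merged = {**existing, **cfg}
    # merged == {"image_mean": [0.5, 0.5, 0.5],
    #            "media_proc_cfg": {"in_patch_limit": 16384},
    #            "in_patch_limit": 16384}
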
@@ -1916,10 +1924,10 @@ class MmprojModel(ModelBase):
         self.image_size = self.find_vparam(["image_size"])
         self.gguf_writer.add_vision_image_size(self.image_size)
         self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
-        self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
-        self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+        self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
+        self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))

         # preprocessor config
         image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]

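These vt_*-prefixed fallbacks work because find_vparam scans its key list in order and returns the first one present in the vision config. A simplified stand-in (not the actual helper, just the lookup behavior):

    def find_first_key(hparams: dict, keys: list[str]):
        # hypothetical stand-in for find_vparam/find_hparam: first matching key wins
        for k in keys:
            if k in hparams:
                return hparams[k]
        raise KeyError(f"none of {keys} found")

    vision_cfg = {"vt_hidden_size": 1152, "vt_num_attention_heads": 16}  # Kimi-K2.5 style names
    assert find_first_key(vision_cfg, ["hidden_size", "vt_hidden_size"]) == 1152
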
@@ -7601,6 +7609,7 @@ class DeepseekModel(TextModel):
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
+    "KimiK25ForConditionalGeneration",
     "YoutuForCausalLM",
     "YoutuVLForConditionalGeneration",
 )

@@ -7719,8 +7728,8 @@ class DeepseekV2Model(TextModel):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_tower" in name or "multi_modal_projector" in name:
+        # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
+        if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
             return
         if name.startswith("siglip2.") or name.startswith("merger."):
             return

@@ -11082,6 +11091,104 @@ class KimiVLModel(MmprojModel):
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("KimiK25ForConditionalGeneration")
+class KimiK25Model(MmprojModel):
+    """Kimi-K2.5 with MoonViT3d vision encoder"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config"
+
+        self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2]))
+        self.patch_size = self.hparams_vision.get("patch_size", 14)
+
+        # Set image_size for compatibility with base class
+        # Use position embedding dimensions as image_size reference
+        pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64)
+        self.hparams_vision["image_size"] = pos_emb_h * self.patch_size
+
+    def set_gguf_parameters(self):
+        # Base class MmprojModel.set_gguf_parameters() already writes:
+        # - vision_block_count, vision_head_count, vision_embedding_length
+        # - vision_feed_forward_length, vision_patch_size, image_mean, image_std
+        # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)
+
+        # Position embedding parameters (for interpolation)
+        self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
+        self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
+        self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))
+
+        # Projector parameters
+        self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu")
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
+
+        # Image size limits
+        # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
+        in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
+        min_patches = 8 # reasonable minimum
+        pixels_per_patch = self.patch_size ** 2
+        self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
+        self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int) -> Tensor:
+        out_dim, in_dim = weights.shape
+        head_dim = out_dim // n_head
+        w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
+        w = w.permute(0, 2, 1, 3, 4)
+        return w.reshape(out_dim, in_dim)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Only process vision and projector tensors
+        is_vision = any(x in name for x in ["vision_tower", "mm_projector"])
+
+        if not is_vision:
+            return
+
+        assert self.hparams_vision is not None
+        n_head = self.hparams_vision.get("num_attention_heads", 16)
+
+        # Permute Q/K weights/biases from interleaved to split RoPE format
+        # This allows using build_rope_2d at runtime without post-permutation.
+        if "wqkv" in name:
+            out_dim = data_torch.shape[0]
+            qkv_dim = out_dim // 3
+            head_dim = qkv_dim // n_head
+
+            if "weight" in name:
+                wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2*qkv_dim, :], data_torch[2*qkv_dim:, :]
+                wq = self.permute(wq, n_head)
+                wk = self.permute(wk, n_head)
+                data_torch = torch.cat([wq, wk, wv], dim=0)
+            elif "bias" in name:
+                bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2*qkv_dim], data_torch[2*qkv_dim:]
+                bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                data_torch = torch.cat([bq, bk, bv], dim=0)
+
+
+        # Temporal embeddings: (T, 1, C) → (T, C)
+        if "pos_emb.time_weight" in name:
+            T, _, C = data_torch.shape
+            data_torch = data_torch.reshape(T, C)
+
+        # PatchMergerMLP tensor name mapping
+        # proj.0.weight → proj.linear_1.weight
+        # proj.2.weight → proj.linear_2.weight
+        if "mm_projector.proj.0." in name:
+            name = name.replace(".proj.0.", ".proj.linear_1.")
+        elif "mm_projector.proj.2." in name:
+            name = name.replace(".proj.2.", ".proj.linear_2.")
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("CogVLMForCausalLM")
 class CogVLMVisionModel(MmprojModel):

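The wqkv handling in KimiK25Model.modify_tensors is the heart of the conversion: Q and K rows come out of the checkpoint in the interleaved 2D-RoPE layout, and permute regroups each head's rows in blocks of two so the even blocks land in the first half and the odd blocks in the second half, the split layout that build_rope_2d expects at runtime. A tiny sanity check of the index mapping (one head, head_dim = 8):

    import torch

    def permute(weights: torch.Tensor, n_head: int) -> torch.Tensor:
        # same reshape/permute as KimiK25Model.permute above
        out_dim, in_dim = weights.shape
        head_dim = out_dim // n_head
        w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
        w = w.permute(0, 2, 1, 3, 4)
        return w.reshape(out_dim, in_dim)

    rows = torch.arange(8, dtype=torch.float32).reshape(8, 1)
    print(permute(rows, n_head=1).flatten().tolist())
    # -> [0.0, 1.0, 4.0, 5.0, 2.0, 3.0, 6.0, 7.0]  (even 2-row blocks first, then odd blocks)
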
@@ -3763,6 +3763,7 @@ class VisionProjectorType:
     VOXTRAL = "voxtral"
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
+    KIMIK25 = "kimik25"
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"

@@ -1298,6 +1298,7 @@ class TensorNameMap:

         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
+            "mm_projector.proj.linear_{bid}", # Kimi-K2.5
             "visual.merger.mlp.{bid}", # qwen2vl
             "merger.mlp.{bid}",
         ),

@@ -1359,6 +1360,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
             "visual.blocks.{bid}.attn.qkv", # qwen3vl
             "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
+            "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q: (

@@ -1533,6 +1535,7 @@ class TensorNameMap:
             "multi_modal_projector.norm",
             "multi_modal_projector.layer_norm",
             "multi_modal_projector.pre_norm",
+            "mm_projector.pre_norm", # Kimi-K2.5
             "pre_mm_projector_norm",
             "model.vision.linear_proj.norm1", # cogvlm
             "merger.ln_q",

@@ -19,6 +19,7 @@ add_library(mtmd
     models/glm4v.cpp
     models/internvl.cpp
     models/kimivl.cpp
+    models/kimik25.cpp
     models/llama4.cpp
     models/llava.cpp
     models/minicpmv.cpp

@@ -235,6 +235,7 @@ enum projector_type {
     PROJECTOR_TYPE_LFM2A,
     PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_YOUTUVL,
+    PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -268,6 +269,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LFM2A, "lfm2a"},
     { PROJECTOR_TYPE_GLM4V, "glm4v"},
     { PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
+    { PROJECTOR_TYPE_KIMIK25, "kimik25"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {

@@ -655,6 +655,11 @@ ggml_tensor * clip_graph::build_rope_2d(
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos = cur->ne[2];

+    // Ensure input is contiguous (needed when using merged QKV with ggml_view)
+    if (!ggml_is_contiguous(cur)) {
+        cur = ggml_cont(ctx0, cur);
+    }
+
     // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)

@@ -825,6 +830,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_kimivl>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_KIMIK25:
+            {
+                builder = std::make_unique<clip_graph_kimik25>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_COGVLM:
             {
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);

@@ -1139,6 +1148,22 @@ struct clip_model_loader {
                 hparams.set_limit_image_tokens(8, 1024);
                 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
             } break;
+        case PROJECTOR_TYPE_KIMIK25:
+            {
+                hparams.rope_theta = 10000.0f;
+                get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+
+                int min_pixels = 0, max_pixels = 0;
+                get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
+                get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
+                if (min_pixels > 0 && max_pixels > 0) {
+                    hparams.image_min_pixels = min_pixels;
+                    hparams.image_max_pixels = max_pixels;
+                    hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
+                } else {
+                    hparams.set_limit_image_tokens(2, 4096);
+                }
+            } break;
         case PROJECTOR_TYPE_GEMMA3:
             {
                 // default value (used by all model sizes in gemma 3 family)

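With the defaults written by the conversion script (patch_size 14, a minimum of 8 patches, in_patch_limit 16384), the metadata read back here works out to image_min_pixels = 8 * 196 = 1568 and image_max_pixels = 16384 * 196 = 3211264, so warmup_image_size = sqrt(3211264) = 1792, i.e. a 128 x 128 grid of 14-px patches. A quick check of that arithmetic, assuming those defaults:

    patch_size = 14
    pixels_per_patch = patch_size ** 2        # 196
    min_pixels = 8 * pixels_per_patch         # 1568
    max_pixels = 16384 * pixels_per_patch     # 3211264
    warmup = int(max_pixels ** 0.5)           # 1792 == 128 * patch_size
    assert warmup * warmup == max_pixels
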
@@ -1668,6 +1693,7 @@ struct clip_model_loader {
                 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
             } break;
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
             {
                 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
                 model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);

@@ -3039,6 +3065,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 res_imgs->entries.push_back(std::move(res));
             } break;

+        case PROJECTOR_TYPE_KIMIK25:
+            {
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * params.n_merge,
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                const std::array<uint8_t, 3> pad_color = {0, 0, 0};
+
+                clip_image_u8 resized_img;
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(res));
+            } break;
+
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_LDP:

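The resize target comes from img_tool::calc_size_preserved_ratio with an alignment of patch_size * n_merge (28 with the defaults) and the pixel bounds loaded above. Roughly, the intent is to keep the aspect ratio while snapping each side to a multiple of that alignment and keeping the total pixel count inside [image_min_pixels, image_max_pixels]. A rough Python illustration of that sizing rule (an approximation of the idea, not the actual helper):

    import math

    def target_size(w, h, align=28, min_pixels=1568, max_pixels=3211264):
        scale = 1.0
        if w * h > max_pixels:
            scale = math.sqrt(max_pixels / (w * h))
        elif w * h < min_pixels:
            scale = math.sqrt(min_pixels / (w * h))
        # snap each side to a multiple of the patch-merge alignment
        new_w = max(align, round(w * scale / align) * align)
        new_h = max(align, round(h * scale / align) * align)
        return new_w, new_h

    print(target_size(4032, 3024))  # -> (2072, 1540), i.e. a 74 x 55 grid of 28-px cells
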
@@ -3247,6 +3290,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
             {
                 // dynamic size
                 int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;

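For these dynamic-size projectors the token count is driven by out_patch_size = patch_size * n_merge = 28, i.e. roughly one token per 28 x 28 cell after the 2 x 2 merge. That also ties back to the limits above: 16384 raw patches correspond to at most 16384 / 4 = 4096 merged tokens, matching the set_limit_image_tokens(2, 4096) fallback in the loader. A small check, assuming one token per merged cell:

    patch_size, n_merge = 14, 2
    out_patch_size = patch_size * n_merge          # 28
    w, h = 2072, 1540                              # example preprocessed size from the note above
    n_tokens = (w // out_patch_size) * (h // out_patch_size)
    print(n_tokens)                                # 74 * 55 = 4070 <= 4096
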
@@ -3588,6 +3632,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // set the 2D positions

@@ -3724,6 +3769,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
     }

+    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
+    if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
+        const int64_t n_embd = embeddings->ne[0];
+        const int64_t n_tokens = embeddings->ne[1];
+        std::vector<float> emb_data(n_embd * n_tokens);
+        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
+
+        LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
+        LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);
+
+        // Print first few values of first token
+        LOG_INF("Token 0 (first 16 values): ");
+        for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
+            LOG_INF("%.6f ", emb_data[i]);
+        }
+        LOG_INF("\n");
+
+        // Print last few values of first token
+        if (n_embd > 16) {
+            LOG_INF("Token 0 (last 16 values): ");
+            for (int64_t i = n_embd - 16; i < n_embd; i++) {
+                LOG_INF("%.6f ", emb_data[i]);
+            }
+            LOG_INF("\n");
+        }
+
+        // Compute and print statistics
+        float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
+        for (size_t i = 0; i < emb_data.size(); i++) {
+            sum += emb_data[i];
+            sum_sq += emb_data[i] * emb_data[i];
+            min_val = std::min(min_val, emb_data[i]);
+            max_val = std::max(max_val, emb_data[i]);
+        }
+        float mean = sum / emb_data.size();
+        float variance = (sum_sq / emb_data.size()) - (mean * mean);
+        LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
+                mean, sqrtf(variance), min_val, max_val, sum);
+        LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
+    }
+
     return true;
 }

@@ -3770,6 +3856,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];

@@ -0,0 +1,101 @@
+#include "models.h"
+#include <cstring>
+#include <cmath>
+
+// note: this is similar to clip_graph::resize_position_embeddings, major difference is having
+// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead
+// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
+ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
+    ggml_tensor * pos_embd = model.position_embeddings;
+    const int height = img.ny / patch_size;
+    const int width = img.nx / patch_size;
+    const uint32_t mode = interpolation_mode;
+
+    GGML_ASSERT(pos_embd);
+
+    const int64_t stored_c = pos_embd->ne[0]; // C = 1152
+    const int64_t orig_w = pos_embd->ne[1]; // W = 64
+    const int64_t orig_h = pos_embd->ne[2]; // H = 64
+
+    GGML_ASSERT(stored_c == n_embd);
+
+    if (height == (int)orig_h && width == (int)orig_w) {
+        // No interpolation needed, just flatten to [C, H*W]
+        return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    }
+
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    return pos_embd;
+}
+
+ggml_cgraph * clip_graph_kimik25::build() {
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
+
+    // Kimi-K2.5 uses interleaved 2D RoPE pattern natively, but
+    // Q / K are permuted during conversion to use split format.
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        return cur;
+    };
+
+    ggml_tensor * inp = build_inp();
+
+    // I don't know why, but doing this in the build_vit lead to the ggml_add not occurring?
+    // Doing it manually here does work.
+    inp = ggml_add(ctx0, inp, learned_pos_embd);
+
+    ggml_tensor * cur = build_vit(
+        inp, n_patches,
+        NORM_TYPE_NORMAL,
+        hparams.ffn_op,
+        nullptr,
+        add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    {
+        // patch_merger
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+
+        // projection norm
+        int proj_inp_dim = cur->ne[0];
+        int n_merged_patches = cur->ne[1];
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_merged_patches * scale_factor * scale_factor,
+            ggml_row_size(cur->type, n_embd), 0);
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        cur = ggml_view_2d(ctx0, cur,
+            proj_inp_dim, n_merged_patches,
+            ggml_row_size(cur->type, proj_inp_dim), 0);
+        cb(cur, "proj_inp_normed", -1);
+
+        // projection mlp
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+
+        cb(cur, "proj_out", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}

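resize_position_embeddings_3d above treats the learned table as C x W x H in ggml's ne order (1152 x 64 x 64), bicubically resizes it to the actual patch grid of the input image, then flattens it to [C, H*W] so it can be added to the patch embeddings. A rough PyTorch equivalent for intuition (not the ggml code path; it assumes the table is given as an (H, W, C) tensor):

    import torch
    import torch.nn.functional as F

    def resize_pos_embd(pos_embd: torch.Tensor, grid_h: int, grid_w: int) -> torch.Tensor:
        # pos_embd: (H, W, C) learned table, e.g. (64, 64, 1152)
        H, W, C = pos_embd.shape
        if (grid_h, grid_w) == (H, W):
            return pos_embd.reshape(H * W, C)
        x = pos_embd.permute(2, 0, 1).unsqueeze(0)              # (1, C, H, W)
        x = F.interpolate(x, size=(grid_h, grid_w), mode="bicubic", align_corners=False)
        return x.squeeze(0).permute(1, 2, 0).reshape(grid_h * grid_w, C)

    # e.g. a 1540 x 2072 input with 14-px patches -> a 110 x 148 grid
    out = resize_pos_embd(torch.randn(64, 64, 1152), 110, 148)
    print(out.shape)  # torch.Size([16280, 1152])
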
@@ -109,3 +109,10 @@ struct clip_graph_mobilenetv5 : clip_graph {
         ggml_tensor * inp,
         const mobilenetv5_block & block);
 };
+
+struct clip_graph_kimik25 : clip_graph {
+    clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+
+    ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
+};