Merge eefcfeed7a into 650bf14eb9
This commit is contained in:
commit
d66447f506
|
|
@ -4250,9 +4250,7 @@ class Qwen2VLVisionModel(MmprojModel):
|
|||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2_5OmniModel")
|
||||
class Qwen25OmniModel(Qwen2VLVisionModel):
|
||||
has_vision_encoder = True
|
||||
class Qwen25AudioModel(MmprojModel):
|
||||
has_audio_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
|
@ -4268,12 +4266,6 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
|
|||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("audio_config")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# SinusoidsPositionEmbedding
|
||||
assert self.hparams_audio is not None
|
||||
|
|
@ -4303,8 +4295,33 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
|
|||
if "audio_bos_eos_token" in name:
|
||||
# this tensor is left unused in transformers code
|
||||
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
|
||||
return
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2_5OmniModel")
|
||||
class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("audio_config")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if "visual." in name:
|
||||
yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
||||
elif "audio_tower." in name:
|
||||
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
|
|
@ -4808,7 +4825,10 @@ class RND1Model(Qwen2MoeModel):
|
|||
class Qwen3VLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
if self.hparams_vision is None:
|
||||
logger.info("No vision config found, skipping vision tensor processing")
|
||||
return
|
||||
|
||||
# Compute image_size if not present
|
||||
if "image_size" not in self.hparams_vision:
|
||||
# For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
|
||||
|
|
@ -4829,7 +4849,9 @@ class Qwen3VLVisionModel(MmprojModel):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
|
||||
# in case mixed modalities, the arch will be handled by subclass
|
||||
if not self.has_audio_encoder:
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
|
||||
if self.hparams_vision is not None:
|
||||
|
|
@ -4917,11 +4939,64 @@ class Qwen3VLVisionModel(MmprojModel):
|
|||
return
|
||||
|
||||
if name.startswith("visual."):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
return
|
||||
yield (self.map_tensor_name(name), data_torch)
|
||||
return [] # skip other tensors
|
||||
|
||||
# Fall back to parent class for other tensors
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
|
||||
class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
if self.has_vision_encoder:
|
||||
return self.global_config["thinker_config"].get("vision_config")
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
if self.has_audio_encoder:
|
||||
return self.global_config["thinker_config"].get("audio_config")
|
||||
else:
|
||||
return None
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
if self.has_vision_encoder:
|
||||
Qwen3VLVisionModel.set_gguf_parameters(self)
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
|
||||
if self.has_audio_encoder:
|
||||
Qwen25AudioModel.set_gguf_parameters(self)
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if "visual." in name:
|
||||
if not self.has_vision_encoder:
|
||||
raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
|
||||
# need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
|
||||
name = name.replace("thinker.visual.", "model.visual.")
|
||||
if ".merger_list." in name:
|
||||
name = name.replace(".merger_list.", ".deepstack_merger_list.")
|
||||
name = name.replace(".ln_q", ".norm")
|
||||
name = name.replace(".mlp.0", ".linear_fc1")
|
||||
name = name.replace(".mlp.2", ".linear_fc2")
|
||||
elif ".merger." in name:
|
||||
name = name.replace(".ln_q", ".norm")
|
||||
name = name.replace(".mlp.0", ".linear_fc1")
|
||||
name = name.replace(".mlp.2", ".linear_fc2")
|
||||
yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
||||
elif "audio_tower." in name:
|
||||
if not self.has_audio_encoder:
|
||||
raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
|
||||
if "conv2d" in name and name.endswith(".bias"):
|
||||
# transform conv2d bias [n_embd] --> [1, 1, n_embd]
|
||||
data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
|
||||
yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3ASRForConditionalGeneration")
|
||||
class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = False
|
||||
|
||||
|
||||
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
|
||||
|
|
@ -4955,9 +5030,10 @@ class Qwen3VLTextModel(Qwen3Model):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
|
||||
vision_config = self.hparams.get("vision_config", {})
|
||||
if "thinker_config" in self.hparams:
|
||||
vision_config = self.hparams["thinker_config"].get("vision_config", {})
|
||||
else:
|
||||
vision_config = self.hparams.get("vision_config", {})
|
||||
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
||||
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
||||
|
||||
|
|
@ -4969,20 +5045,66 @@ class Qwen3VLTextModel(Qwen3Model):
|
|||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
|
||||
class Qwen3VLMoeTextModel(Qwen3MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
|
||||
@ModelBase.register("Qwen3ASRForConditionalGeneration")
|
||||
class Qwen3ASRTextModel(Qwen3VLTextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VL
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
vision_config = self.hparams.get("vision_config", {})
|
||||
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
||||
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
||||
self.gguf_writer.add_num_deepstack_layers(0)
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# fix chat template, use correct chatml format
|
||||
self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
|
||||
# correct BOS/EOS tokens
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
|
||||
for token_id, data in added_tokens.items():
|
||||
if data.get("content") == "<|im_end|>":
|
||||
self.gguf_writer.add_bos_token_id(int(token_id))
|
||||
self.gguf_writer.add_eos_token_id(int(token_id))
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
# qwen3-omni
|
||||
name = name.replace("thinker.", "")
|
||||
|
||||
# Skip vision and audio tensors - they go in the mmproj file
|
||||
if "visual." in name or "audio_tower." in name \
|
||||
or "talker." in name or "code2wav." in name:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3VLMoeForConditionalGeneration", "Qwen3OmniMoeForConditionalGeneration")
|
||||
class Qwen3VLMoeTextModel(Qwen3MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# correct BOS/EOS tokens
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
|
||||
for token_id, data in added_tokens.items():
|
||||
if data.get("content") == "<|im_end|>":
|
||||
self.gguf_writer.add_bos_token_id(int(token_id))
|
||||
self.gguf_writer.add_eos_token_id(int(token_id))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_num_deepstack_layers(0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Skip vision tensors - they go in the mmproj file
|
||||
if name.startswith("model.visual."):
|
||||
return
|
||||
if "visual." in name or "audio_tower." in name \
|
||||
or "talker." in name or "code2wav." in name:
|
||||
return []
|
||||
|
||||
# qwen3-omni
|
||||
name = name.replace("thinker.", "")
|
||||
|
||||
# Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
|
||||
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
|
||||
|
|
@ -5016,6 +5138,26 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
|
|||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
|
||||
class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# correct BOS/EOS tokens
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
added_tokens = tokenizer_config.get("added_tokens_decoder", {})
|
||||
for token_id, data in added_tokens.items():
|
||||
if data.get("content") == "<|im_end|>":
|
||||
self.gguf_writer.add_bos_token_id(int(token_id))
|
||||
self.gguf_writer.add_eos_token_id(int(token_id))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_num_deepstack_layers(0)
|
||||
|
||||
|
||||
class _LinearAttentionVReorderBase(Qwen3NextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses
|
||||
"""reorders V heads from grouped to tiled order for ggml broadcast
|
||||
|
|
|
|||
|
|
@ -794,6 +794,8 @@ class MODEL_TENSOR(IntEnum):
|
|||
A_ENC_INP_PROJ = auto() # gemma4
|
||||
A_ENC_CONV1D = auto()
|
||||
A_ENC_CONV1D_NORM = auto() # gemma3n
|
||||
A_ENC_CONV2D = auto()
|
||||
A_ENC_CONV_OUT = auto()
|
||||
A_PRE_NORM = auto()
|
||||
A_POST_NORM = auto()
|
||||
A_ENC_LAYER_PRE_NORM = auto() # gemma3n
|
||||
|
|
@ -1271,6 +1273,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection",
|
||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||
MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}",
|
||||
MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out",
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
||||
|
|
@ -1414,6 +1418,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ,
|
||||
MODEL_TENSOR.A_ENC_CONV1D,
|
||||
MODEL_TENSOR.A_ENC_CONV2D,
|
||||
MODEL_TENSOR.A_ENC_CONV_OUT,
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
||||
MODEL_TENSOR.A_PRE_NORM,
|
||||
MODEL_TENSOR.A_POST_NORM,
|
||||
|
|
@ -4097,6 +4103,7 @@ class VisionProjectorType:
|
|||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN3A = "qwen3a" # audio
|
||||
GLMA = "glma" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
VOXTRAL = "voxtral"
|
||||
|
|
|
|||
|
|
@ -1844,6 +1844,14 @@ class TensorNameMap:
|
|||
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV2D: (
|
||||
"audio_tower.conv2d{bid}", # qwen3omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_OUT: (
|
||||
"audio_tower.conv_out", # qwen3omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PRE_NORM: (),
|
||||
|
||||
MODEL_TENSOR.A_POST_NORM: (
|
||||
|
|
@ -1994,7 +2002,8 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_MMPROJ: (
|
||||
"audio.multi_modal_projector.linear_{bid}", # ultravox
|
||||
"audio_adapter.model.{bid}" # lfm2
|
||||
"audio_adapter.model.{bid}", # lfm2
|
||||
"audio_tower.proj{bid}", # qwen3omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ add_library(mtmd
|
|||
models/pixtral.cpp
|
||||
models/qwen2vl.cpp
|
||||
models/qwen3vl.cpp
|
||||
models/qwen3a.cpp
|
||||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/deepseekocr.cpp
|
||||
|
|
|
|||
|
|
@ -135,6 +135,8 @@
|
|||
|
||||
// ultravox
|
||||
#define TN_CONV1D "a.conv1d.%d.%s"
|
||||
#define TN_CONV2D "a.conv2d.%d.%s"
|
||||
#define TN_CONV_OUT "a.conv_out.%s"
|
||||
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
|
||||
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
|
||||
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
|
||||
|
|
@ -250,6 +252,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN3A,
|
||||
PROJECTOR_TYPE_GLMA,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
|
|
@ -290,6 +293,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN3A, "qwen3a"},
|
||||
{ PROJECTOR_TYPE_GLMA, "glma"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
|
|
|
|||
|
|
@ -406,10 +406,20 @@ struct clip_model {
|
|||
ggml_tensor * conv1d_1_b = nullptr;
|
||||
ggml_tensor * conv1d_2_w = nullptr;
|
||||
ggml_tensor * conv1d_2_b = nullptr;
|
||||
ggml_tensor * conv_out_w = nullptr;
|
||||
ggml_tensor * conv_out_b = nullptr;
|
||||
ggml_tensor * mm_norm_pre_w = nullptr;
|
||||
ggml_tensor * mm_norm_pre_b = nullptr;
|
||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||
|
||||
// qwen3a
|
||||
ggml_tensor * conv2d_1_w = nullptr;
|
||||
ggml_tensor * conv2d_1_b = nullptr;
|
||||
ggml_tensor * conv2d_2_w = nullptr;
|
||||
ggml_tensor * conv2d_2_b = nullptr;
|
||||
ggml_tensor * conv2d_3_w = nullptr;
|
||||
ggml_tensor * conv2d_3_b = nullptr;
|
||||
|
||||
// cogvlm
|
||||
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
||||
|
|
|
|||
|
|
@ -922,6 +922,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_qwen3a>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
|
||||
|
|
@ -1366,6 +1370,7 @@ struct clip_model_loader {
|
|||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
|
|
@ -1968,6 +1973,20 @@ struct clip_model_loader {
|
|||
model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
|
||||
model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
{
|
||||
model.conv2d_1_w = get_tensor(string_format(TN_CONV2D, 1, "weight"));
|
||||
model.conv2d_1_b = get_tensor(string_format(TN_CONV2D, 1, "bias"));
|
||||
model.conv2d_2_w = get_tensor(string_format(TN_CONV2D, 2, "weight"));
|
||||
model.conv2d_2_b = get_tensor(string_format(TN_CONV2D, 2, "bias"));
|
||||
model.conv2d_3_w = get_tensor(string_format(TN_CONV2D, 3, "weight"));
|
||||
model.conv2d_3_b = get_tensor(string_format(TN_CONV2D, 3, "bias"));
|
||||
model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight")); // no bias
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
|
|
@ -2742,6 +2761,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
n_patches /= 2;
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
{
|
||||
// 3x stride-2 conv2d: each step is floor((n-1)/2)+1
|
||||
int n = img->nx;
|
||||
n = (n - 1) / 2 + 1;
|
||||
n = (n - 1) / 2 + 1;
|
||||
n = (n - 1) / 2 + 1;
|
||||
n_patches = n;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
|
|
@ -3167,6 +3195,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
|
|
@ -3339,8 +3368,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
|
|
@ -3388,6 +3418,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
|||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
|
|
|
|||
|
|
@ -131,6 +131,11 @@ struct clip_graph_mobilenetv5 : clip_graph {
|
|||
const mobilenetv5_block & block);
|
||||
};
|
||||
|
||||
struct clip_graph_qwen3a : clip_graph {
|
||||
clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_kimik25 : clip_graph {
|
||||
clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,68 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_qwen3a::build() {
|
||||
ggml_tensor * inp = build_inp_raw(1);
|
||||
|
||||
// conv2d block
|
||||
// TODO: do we need to split by chunks of n_window each like on transformers impl?
|
||||
{
|
||||
inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, model.conv2d_1_b);
|
||||
inp = ggml_gelu_erf(ctx0, inp);
|
||||
|
||||
inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, model.conv2d_2_b);
|
||||
inp = ggml_gelu_erf(ctx0, inp);
|
||||
|
||||
inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, model.conv2d_3_b);
|
||||
inp = ggml_gelu_erf(ctx0, inp);
|
||||
|
||||
// inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
|
||||
cb(inp, "after_conv_blocks", -1);
|
||||
|
||||
const int64_t n_pos_after_conv = inp->ne[0];
|
||||
const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
|
||||
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
|
||||
|
||||
// project to n_embd
|
||||
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
|
||||
if (model.conv_out_b) {
|
||||
inp = ggml_add(ctx0, inp, model.conv_out_b);
|
||||
}
|
||||
cb(inp, "after_conv_out", -1);
|
||||
}
|
||||
|
||||
auto n_pos = inp->ne[1];
|
||||
|
||||
ggml_tensor * pos_embd_selected = ggml_view_2d(
|
||||
ctx0, model.position_embeddings,
|
||||
model.position_embeddings->ne[0], n_pos,
|
||||
model.position_embeddings->nb[1], 0
|
||||
);
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_pos,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
pos_embd_selected,
|
||||
nullptr);
|
||||
|
||||
cb(cur, "after_transformer", -1);
|
||||
|
||||
// projector
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
cb(cur, "projected", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -425,6 +425,7 @@ struct mtmd_context {
|
|||
// set preprocessor
|
||||
switch (proj) {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN3A:
|
||||
case PROJECTOR_TYPE_QWEN25O:
|
||||
{
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
|
|
@ -989,6 +990,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
|||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
|
||||
// qwen3-asr
|
||||
return true;
|
||||
}
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
|
|
|
|||
Loading…
Reference in New Issue