wip
This commit is contained in:
parent
d7435467e7
commit
d703cf7184
|
|
@ -4515,12 +4515,15 @@ class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
|
||||||
name = name.replace(".ln_q", ".norm")
|
name = name.replace(".ln_q", ".norm")
|
||||||
name = name.replace(".mlp.0", ".linear_fc1")
|
name = name.replace(".mlp.0", ".linear_fc1")
|
||||||
name = name.replace(".mlp.2", ".linear_fc2")
|
name = name.replace(".mlp.2", ".linear_fc2")
|
||||||
if ".merger." in name:
|
elif ".merger." in name:
|
||||||
name = name.replace(".ln_q", ".norm")
|
name = name.replace(".ln_q", ".norm")
|
||||||
name = name.replace(".mlp.0", ".linear_fc1")
|
name = name.replace(".mlp.0", ".linear_fc1")
|
||||||
name = name.replace(".mlp.2", ".linear_fc2")
|
name = name.replace(".mlp.2", ".linear_fc2")
|
||||||
return Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
return Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
||||||
elif "audio_tower." in name:
|
elif "audio_tower." in name:
|
||||||
|
if "conv2d" in name and name.endswith(".bias"):
|
||||||
|
# transform conv2d bias [n_embd] --> [1, 1, n_embd]
|
||||||
|
data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
|
||||||
return Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
|
return Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
@ -4555,9 +4558,10 @@ class Qwen3VLTextModel(Qwen3Model):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
|
if "thinker_config" in self.hparams:
|
||||||
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
|
vision_config = self.hparams["thinker_config"].get("vision_config", {})
|
||||||
vision_config = self.hparams.get("vision_config", {})
|
else:
|
||||||
|
vision_config = self.hparams.get("vision_config", {})
|
||||||
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
||||||
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
||||||
|
|
||||||
|
|
@ -4575,7 +4579,10 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
vision_config = self.hparams.get("vision_config", {})
|
if "thinker_config" in self.hparams:
|
||||||
|
vision_config = self.hparams["thinker_config"].get("vision_config", {})
|
||||||
|
else:
|
||||||
|
vision_config = self.hparams.get("vision_config", {})
|
||||||
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
|
||||||
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -700,6 +700,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
A_ENC_EMBD_NORM = auto()
|
A_ENC_EMBD_NORM = auto()
|
||||||
A_ENC_EMBD_TO_LOGITS = auto()
|
A_ENC_EMBD_TO_LOGITS = auto()
|
||||||
A_ENC_CONV1D = auto()
|
A_ENC_CONV1D = auto()
|
||||||
|
A_ENC_CONV2D = auto()
|
||||||
A_ENC_CONV_OUT = auto()
|
A_ENC_CONV_OUT = auto()
|
||||||
A_PRE_NORM = auto()
|
A_PRE_NORM = auto()
|
||||||
A_POST_NORM = auto()
|
A_POST_NORM = auto()
|
||||||
|
|
@ -1098,6 +1099,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
||||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||||
|
MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}",
|
||||||
MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out",
|
MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out",
|
||||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||||
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
||||||
|
|
@ -1196,6 +1198,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||||
MODEL_TENSOR.A_ENC_CONV1D,
|
MODEL_TENSOR.A_ENC_CONV1D,
|
||||||
|
MODEL_TENSOR.A_ENC_CONV2D,
|
||||||
MODEL_TENSOR.A_ENC_CONV_OUT,
|
MODEL_TENSOR.A_ENC_CONV_OUT,
|
||||||
MODEL_TENSOR.A_PRE_NORM,
|
MODEL_TENSOR.A_PRE_NORM,
|
||||||
MODEL_TENSOR.A_POST_NORM,
|
MODEL_TENSOR.A_POST_NORM,
|
||||||
|
|
|
||||||
|
|
@ -1563,6 +1563,9 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.A_ENC_CONV1D: (
|
MODEL_TENSOR.A_ENC_CONV1D: (
|
||||||
"audio_tower.conv{bid}", # ultravox
|
"audio_tower.conv{bid}", # ultravox
|
||||||
"conformer.pre_encode.conv.{bid}", # lfm2
|
"conformer.pre_encode.conv.{bid}", # lfm2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.A_ENC_CONV2D: (
|
||||||
"audio_tower.conv2d{bid}", # qwen3omni
|
"audio_tower.conv2d{bid}", # qwen3omni
|
||||||
),
|
),
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ add_library(mtmd
|
||||||
models/qwen3vl.cpp
|
models/qwen3vl.cpp
|
||||||
models/siglip.cpp
|
models/siglip.cpp
|
||||||
models/whisper-enc.cpp
|
models/whisper-enc.cpp
|
||||||
|
models/qwen3a.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
set_target_properties(mtmd PROPERTIES
|
set_target_properties(mtmd PROPERTIES
|
||||||
|
|
|
||||||
|
|
@ -125,6 +125,7 @@
|
||||||
|
|
||||||
// ultravox
|
// ultravox
|
||||||
#define TN_CONV1D "a.conv1d.%d.%s"
|
#define TN_CONV1D "a.conv1d.%d.%s"
|
||||||
|
#define TN_CONV2D "a.conv2d.%d.%s"
|
||||||
#define TN_CONV_OUT "a.conv_out.%s"
|
#define TN_CONV_OUT "a.conv_out.%s"
|
||||||
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
|
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
|
||||||
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
|
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
|
||||||
|
|
|
||||||
|
|
@ -304,6 +304,14 @@ struct clip_model {
|
||||||
ggml_tensor * mm_norm_pre_b = nullptr;
|
ggml_tensor * mm_norm_pre_b = nullptr;
|
||||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||||
|
|
||||||
|
// qwen3a
|
||||||
|
ggml_tensor * conv2d_1_w = nullptr;
|
||||||
|
ggml_tensor * conv2d_1_b = nullptr;
|
||||||
|
ggml_tensor * conv2d_2_w = nullptr;
|
||||||
|
ggml_tensor * conv2d_2_b = nullptr;
|
||||||
|
ggml_tensor * conv2d_3_w = nullptr;
|
||||||
|
ggml_tensor * conv2d_3_b = nullptr;
|
||||||
|
|
||||||
// cogvlm
|
// cogvlm
|
||||||
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
||||||
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
||||||
|
|
|
||||||
|
|
@ -817,7 +817,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
case PROJECTOR_TYPE_QWEN3A:
|
|
||||||
case PROJECTOR_TYPE_GLMA:
|
case PROJECTOR_TYPE_GLMA:
|
||||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||||
{
|
{
|
||||||
|
|
@ -847,6 +846,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
{
|
{
|
||||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_QWEN3A:
|
||||||
|
{
|
||||||
|
builder = std::make_unique<clip_graph_qwen3a>(ctx, img);
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("missing cgraph builder");
|
GGML_ABORT("missing cgraph builder");
|
||||||
}
|
}
|
||||||
|
|
@ -1573,10 +1576,12 @@ struct clip_model_loader {
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_QWEN3A:
|
case PROJECTOR_TYPE_QWEN3A:
|
||||||
{
|
{
|
||||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
model.conv2d_1_w = get_tensor(string_format(TN_CONV2D, 1, "weight"));
|
||||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
model.conv2d_1_b = get_tensor(string_format(TN_CONV2D, 1, "bias"));
|
||||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
model.conv2d_2_w = get_tensor(string_format(TN_CONV2D, 2, "weight"));
|
||||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
model.conv2d_2_b = get_tensor(string_format(TN_CONV2D, 2, "bias"));
|
||||||
|
model.conv2d_3_w = get_tensor(string_format(TN_CONV2D, 3, "weight"));
|
||||||
|
model.conv2d_3_b = get_tensor(string_format(TN_CONV2D, 3, "bias"));
|
||||||
model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight")); // no bias
|
model.conv_out_w = get_tensor(string_format(TN_CONV_OUT, "weight")); // no bias
|
||||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||||
|
|
@ -3058,7 +3063,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
case PROJECTOR_TYPE_QWEN3A:
|
|
||||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||||
{
|
{
|
||||||
n_patches = img->nx;
|
n_patches = img->nx;
|
||||||
|
|
@ -3078,6 +3082,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
n_patches /= 2;
|
n_patches /= 2;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_QWEN3A:
|
||||||
|
{
|
||||||
|
return 375; // TODO: calculate this
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_GLMA:
|
case PROJECTOR_TYPE_GLMA:
|
||||||
{
|
{
|
||||||
n_patches = img->nx;
|
n_patches = img->nx;
|
||||||
|
|
@ -3566,6 +3574,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
return ctx->model.mm_fc_w->ne[1];
|
return ctx->model.mm_fc_w->ne[1];
|
||||||
case PROJECTOR_TYPE_QWEN3A:
|
case PROJECTOR_TYPE_QWEN3A:
|
||||||
|
return ctx->model.mm_2_w->ne[1] * 4; // 4 for deepstack, TODO: do NOT hardcode
|
||||||
case PROJECTOR_TYPE_GLMA:
|
case PROJECTOR_TYPE_GLMA:
|
||||||
case PROJECTOR_TYPE_LFM2:
|
case PROJECTOR_TYPE_LFM2:
|
||||||
case PROJECTOR_TYPE_KIMIVL:
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
|
|
||||||
|
|
@ -71,3 +71,8 @@ struct clip_graph_glm4v : clip_graph {
|
||||||
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct clip_graph_qwen3a : clip_graph {
|
||||||
|
clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
|
ggml_cgraph * build() override;
|
||||||
|
};
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
#include "models.h"
|
||||||
|
|
||||||
|
ggml_cgraph * clip_graph_qwen3a::build() {
|
||||||
|
ggml_tensor * inp = build_inp_raw(1);
|
||||||
|
|
||||||
|
// conv2d block
|
||||||
|
// TODO: do we need to split by chunks of n_window each like on transformers impl?
|
||||||
|
{
|
||||||
|
inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
|
||||||
|
inp = ggml_add(ctx0, inp, model.conv2d_1_b);
|
||||||
|
inp = ggml_gelu_erf(ctx0, inp);
|
||||||
|
|
||||||
|
inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
|
||||||
|
inp = ggml_add(ctx0, inp, model.conv2d_2_b);
|
||||||
|
inp = ggml_gelu_erf(ctx0, inp);
|
||||||
|
|
||||||
|
inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
|
||||||
|
inp = ggml_add(ctx0, inp, model.conv2d_3_b);
|
||||||
|
inp = ggml_gelu_erf(ctx0, inp);
|
||||||
|
|
||||||
|
// inp is now [time, frames, channels]
|
||||||
|
cb(inp, "after_conv_blocks", -1);
|
||||||
|
|
||||||
|
inp = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [channels, frames, time]
|
||||||
|
inp = ggml_cont(ctx0, inp);
|
||||||
|
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); // [channels * time, frames]
|
||||||
|
|
||||||
|
// project to n_embd
|
||||||
|
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
|
||||||
|
if (model.conv_out_b) {
|
||||||
|
inp = ggml_add(ctx0, inp, model.conv_out_b);
|
||||||
|
}
|
||||||
|
cb(inp, "after_conv_out", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto n_pos = inp->ne[1];
|
||||||
|
|
||||||
|
ggml_tensor * pos_embd_selected = ggml_view_2d(
|
||||||
|
ctx0, model.position_embeddings,
|
||||||
|
model.position_embeddings->ne[0], n_pos,
|
||||||
|
model.position_embeddings->nb[1], 0
|
||||||
|
);
|
||||||
|
ggml_tensor * cur = build_vit(
|
||||||
|
inp, n_pos,
|
||||||
|
NORM_TYPE_NORMAL,
|
||||||
|
hparams.ffn_op,
|
||||||
|
pos_embd_selected,
|
||||||
|
nullptr);
|
||||||
|
|
||||||
|
cb(cur, "after_transformer", -1);
|
||||||
|
|
||||||
|
// projector
|
||||||
|
cur = build_ffn(cur,
|
||||||
|
model.mm_1_w, model.mm_1_b,
|
||||||
|
nullptr, nullptr,
|
||||||
|
model.mm_2_w, model.mm_2_b,
|
||||||
|
FFN_GELU_ERF,
|
||||||
|
-1);
|
||||||
|
|
||||||
|
cb(cur, "projected", -1);
|
||||||
|
|
||||||
|
// pad deepstack if needed
|
||||||
|
// TODO: do NOT hard code 3 here
|
||||||
|
cur = ggml_pad(ctx0, cur, cur->ne[0] * 3, 0, 0, 0);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
@ -19,18 +19,9 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||||
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
|
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
|
||||||
|
|
||||||
cur = ggml_gelu_erf(ctx0, cur);
|
cur = ggml_gelu_erf(ctx0, cur);
|
||||||
|
|
||||||
// transpose
|
// transpose
|
||||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
||||||
cb(inp, "after_conv1d", -1);
|
cb(inp, "after_conv1d", -1);
|
||||||
|
|
||||||
if (model.conv_out_w) {
|
|
||||||
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
|
|
||||||
if (model.conv_out_b) {
|
|
||||||
inp = ggml_add(ctx0, inp, model.conv_out_b);
|
|
||||||
}
|
|
||||||
cb(inp, "after_conv_out", -1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sanity check (only check one layer, but it should be the same for all)
|
// sanity check (only check one layer, but it should be the same for all)
|
||||||
|
|
@ -86,15 +77,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||||
|
|
||||||
} else if (proj_type == PROJECTOR_TYPE_QWEN3A) {
|
|
||||||
// projector
|
|
||||||
cur = build_ffn(cur,
|
|
||||||
model.mm_1_w, model.mm_1_b,
|
|
||||||
nullptr, nullptr,
|
|
||||||
model.mm_2_w, model.mm_2_b,
|
|
||||||
FFN_GELU_ERF,
|
|
||||||
-1);
|
|
||||||
|
|
||||||
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
|
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
|
||||||
// projector
|
// projector
|
||||||
cur = build_ffn(cur,
|
cur = build_ffn(cur,
|
||||||
|
|
|
||||||
|
|
@ -326,6 +326,7 @@ struct mtmd_context {
|
||||||
// set preprocessor
|
// set preprocessor
|
||||||
switch (proj) {
|
switch (proj) {
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
|
case PROJECTOR_TYPE_QWEN3A:
|
||||||
case PROJECTOR_TYPE_QWEN25O:
|
case PROJECTOR_TYPE_QWEN25O:
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
|
|
@ -344,7 +345,7 @@ struct mtmd_context {
|
||||||
audio_preproc->initialize();
|
audio_preproc->initialize();
|
||||||
|
|
||||||
// set special tokens
|
// set special tokens
|
||||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
if (proj == PROJECTOR_TYPE_QWEN2A || proj == PROJECTOR_TYPE_QWEN3A) {
|
||||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||||
aud_beg = "<|audio_bos|>";
|
aud_beg = "<|audio_bos|>";
|
||||||
aud_end = "<|audio_eos|>";
|
aud_end = "<|audio_eos|>";
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue