Merge 72a41fd960 into 58062860af
commit e3ffcb633c
@@ -1184,7 +1184,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--perf"},
         {"--no-perf"},
@@ -711,6 +711,9 @@ class ModelBase:
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
+        if "lfm" in config:
+            # rename for LFM2-Audio
+            config["text_config"] = config["lfm"]
         return config

     @classmethod
@@ -9712,12 +9715,12 @@ class LFM2Model(TextModel):
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
-        if is_vision_tensor:
-            # skip vision tensors
+        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+            # skip multimodal tensors
             return []

-        name = name.replace("language_model.", "")
+        name = name.replace("language_model.", "")  # vision
+        name = name.replace("lfm.", "model.")  # audio

         # conv op requires 2d tensor
         if 'conv.conv' in name:
@@ -9725,6 +9728,12 @@ class LFM2Model(TextModel):

         return [(self.map_tensor_name(name), data_torch)]

+    def _is_vision_tensor(self, name: str) -> bool:
+        return "vision_tower" in name or "multi_modal_projector" in name
+
+    def _is_audio_tensor(self, name: str):
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+

 @ModelBase.register("Lfm2MoeForCausalLM")
 class LFM2MoeModel(TextModel):
@@ -9830,6 +9839,75 @@ class LFM2VLModel(MmprojModel):
         return []  # skip other tensors


+@ModelBase.register("Lfm2AudioForConditionalGeneration")
+class LFM2AudioModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+    model_name = "Lfm2AudioEncoder"
+
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("encoder")
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip language model tensors
+        if name.startswith("lfm."):
+            return []
+
+        # for training only
+        if any(p in name for p in ["audio_loss_weight"]):
+            return []
+
+        # for audio output
+        if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
+            return []
+
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+
+            if len(self._batch_norm_tensors[bid]) < 5:
+                return []
+
+            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+            eps = 1e-5  # default value
+
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            return [
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+            ]
+
+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("SmallThinkerForCausalLM")
 class SmallThinkerModel(TextModel):
     model_arch = gguf.MODEL_ARCH.SMALLTHINKER
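Note: the batch-norm folding in `modify_tensors` above uses the standard identity that a BatchNorm in eval mode is just a per-channel affine map. A quick way to sanity-check it (a sketch, assuming `torch` is installed; not part of the patch):

```python
import torch

# In eval mode, BatchNorm1d computes y = (x - mean) / sqrt(var + eps) * w + b,
# which folds into y = a * x + c with:
#   a = w / sqrt(var + eps)
#   c = b - mean * a
bn = torch.nn.BatchNorm1d(8).eval()
with torch.no_grad():
    bn.running_mean.uniform_(-1.0, 1.0)
    bn.running_var.uniform_(0.5, 2.0)

a = bn.weight / torch.sqrt(bn.running_var + bn.eps)
c = bn.bias - bn.running_mean * a

x = torch.randn(4, 8)
assert torch.allclose(bn(x), x * a + c, atol=1e-6)
```

The converter hard-codes `eps = 1e-5`, which matches the PyTorch default for `BatchNorm1d`.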
@@ -102,31 +102,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
     const int threads = 128;
     GGML_ASSERT(nr % threads == 0);

+    auto launch_kernel = [&](auto NC) {
+        constexpr int kNC = decltype(NC)::value;
     if (n_t <= 32) {
         const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
-        if (nc == 4) {
-            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
-        }
+        ssm_conv_f32<threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                   dst, dst_nb0, dst_nb1, dst_nb2, n_t);
     } else {
-        if (nc == 4) {
-            const int64_t split_n_t = 32;
-            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
-                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            const int64_t split_n_t = 32;
-            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
-                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
-        }
+        const int64_t split_n_t = 32;
+        dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
+        ssm_conv_long_token_f32<threads, kNC, split_n_t><<<blocks, threads, 0, stream>>>(
+            src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
     }
+    };
+
+    switch (nc) {
+        case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
+        case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
+        case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
+        default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
     }
 }
@@ -690,6 +690,8 @@ class MODEL_TENSOR(IntEnum):
     V_TOK_EOI = auto() # cogvlm
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
+    A_ENC_EMBD_NORM = auto()
+    A_ENC_EMBD_TO_LOGITS = auto()
     A_ENC_CONV1D = auto()
     A_PRE_NORM = auto()
     A_POST_NORM = auto()
@@ -700,8 +702,13 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_OUTPUT = auto()
     A_ENC_OUTPUT_NORM = auto()
     A_ENC_FFN_UP = auto()
+    A_ENC_FFN_NORM = auto()
     A_ENC_FFN_GATE = auto()
     A_ENC_FFN_DOWN = auto()
+    A_ENC_FFN_UP_1 = auto()
+    A_ENC_FFN_NORM_1 = auto()
+    A_ENC_FFN_GATE_1 = auto()
+    A_ENC_FFN_DOWN_1 = auto()
     A_MMPROJ = auto()
     A_MMPROJ_FC = auto()
     A_MM_NORM_PRE = auto()
@@ -713,6 +720,12 @@ class MODEL_TENSOR(IntEnum):
     NEXTN_HNORM = auto()
     NEXTN_SHARED_HEAD_HEAD = auto()
     NEXTN_SHARED_HEAD_NORM = auto()
+    # lfm2 audio
+    A_ENC_NORM_CONV = auto()
+    A_ENC_LINEAR_POS = auto()
+    A_ENC_POS_BIAS_U = auto()
+    A_ENC_POS_BIAS_V = auto()
+    A_ENC_OUT = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1065,6 +1078,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_TOK_EOI: "v.eoi",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
+    MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
+    MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
     MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
     MODEL_TENSOR.A_POST_NORM: "a.post_ln",
@@ -1074,9 +1089,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
     MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
     MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
+    MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm",
     MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
     MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
     MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
+    MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1",
+    MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1",
+    MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1",
+    MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1",
     MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
     MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
     MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
@@ -1088,6 +1108,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
+    # lfm2
+    MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
+    MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
+    MODEL_TENSOR.A_ENC_POS_BIAS_U: "a.blk.{bid}.pos_bias_u",
+    MODEL_TENSOR.A_ENC_POS_BIAS_V: "a.blk.{bid}.pos_bias_v",
+    MODEL_TENSOR.A_ENC_OUT: "a.pre_encode.out",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1145,6 +1171,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_TOK_EOI,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
+        MODEL_TENSOR.A_ENC_EMBD_NORM,
+        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
         MODEL_TENSOR.A_ENC_CONV1D,
         MODEL_TENSOR.A_PRE_NORM,
         MODEL_TENSOR.A_POST_NORM,
@@ -1154,13 +1182,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.A_ENC_INPUT_NORM,
         MODEL_TENSOR.A_ENC_OUTPUT,
         MODEL_TENSOR.A_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.A_ENC_FFN_NORM,
         MODEL_TENSOR.A_ENC_FFN_UP,
         MODEL_TENSOR.A_ENC_FFN_GATE,
         MODEL_TENSOR.A_ENC_FFN_DOWN,
+        MODEL_TENSOR.A_ENC_FFN_NORM_1,
+        MODEL_TENSOR.A_ENC_FFN_UP_1,
+        MODEL_TENSOR.A_ENC_FFN_GATE_1,
+        MODEL_TENSOR.A_ENC_FFN_DOWN_1,
         MODEL_TENSOR.A_MMPROJ,
         MODEL_TENSOR.A_MMPROJ_FC,
         MODEL_TENSOR.A_MM_NORM_PRE,
         MODEL_TENSOR.A_MM_NORM_MID,
+        MODEL_TENSOR.CONVNEXT_DW,
+        MODEL_TENSOR.CONVNEXT_NORM,
+        MODEL_TENSOR.CONVNEXT_PW1,
+        MODEL_TENSOR.CONVNEXT_PW2,
+        MODEL_TENSOR.A_ENC_NORM_CONV,
+        MODEL_TENSOR.A_ENC_LINEAR_POS,
+        MODEL_TENSOR.A_ENC_POS_BIAS_U,
+        MODEL_TENSOR.A_ENC_POS_BIAS_V,
+        MODEL_TENSOR.A_ENC_OUT,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -3363,6 +3405,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    LFM2A = "lfm2a" # audio
     GLM4V = "glm4v"

@@ -1131,18 +1131,26 @@ class TensorNameMap:

         MODEL_TENSOR.CONVNEXT_DW: (
             "backbone.convnext.{bid}.dwconv", # wavtokenizer
+            "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_NORM: (
             "backbone.convnext.{bid}.norm", # wavtokenizer
+            "conformer.layers.{bid}.conv.batch_norm", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_PW1: (
             "backbone.convnext.{bid}.pwconv1", # wavtokenizer
+            "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_PW2: (
             "backbone.convnext.{bid}.pwconv2", # wavtokenizer
+            "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_NORM_CONV: (
+            "conformer.layers.{bid}.norm_conv", # lfm2
         ),

         MODEL_TENSOR.CONVNEXT_GAMMA: (
@@ -1535,10 +1543,20 @@ class TensorNameMap:

         MODEL_TENSOR.A_ENC_EMBD_POS: (
             "audio_tower.embed_positions", # ultravox
+            "audio_embedding.embedding", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_EMBD_NORM: (
+            "audio_embedding.embedding_norm", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: (
+            "audio_embedding.to_logits", # lfm2
         ),

         MODEL_TENSOR.A_ENC_CONV1D: (
             "audio_tower.conv{bid}", # ultravox
+            "conformer.pre_encode.conv.{bid}", # lfm2
         ),

         MODEL_TENSOR.A_PRE_NORM: (),
@@ -1550,36 +1568,76 @@ class TensorNameMap:

         MODEL_TENSOR.A_ENC_ATTN_Q: (
             "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_q", # lfm2
         ),

         MODEL_TENSOR.A_ENC_ATTN_K: (
             "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_k", # lfm2
         ),

         MODEL_TENSOR.A_ENC_ATTN_V: (
             "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_v", # lfm2
         ),

         MODEL_TENSOR.A_ENC_INPUT_NORM: (
             "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
+            "conformer.layers.{bid}.norm_self_att", # lfm2
         ),

         MODEL_TENSOR.A_ENC_OUTPUT: (
             "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
+            "conformer.layers.{bid}.self_attn.linear_out", # lfm2
         ),

         MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
             "audio_tower.layers.{bid}.final_layer_norm", # ultravox
+            "conformer.layers.{bid}.norm_out", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_NORM: (
+            "conformer.layers.{bid}.norm_feed_forward1", # lfm2
         ),

         MODEL_TENSOR.A_ENC_FFN_UP: (
             "audio_tower.layers.{bid}.fc1", # ultravox
+            "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
         ),

         MODEL_TENSOR.A_ENC_FFN_GATE: (),

         MODEL_TENSOR.A_ENC_FFN_DOWN: (
             "audio_tower.layers.{bid}.fc2", # ultravox
+            "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_UP_1: (
+            "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
+            "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_NORM_1: (
+            "conformer.layers.{bid}.norm_feed_forward2", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_LINEAR_POS: (
+            "conformer.layers.{bid}.self_attn.linear_pos", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_POS_BIAS_U: (
+            "conformer.layers.{bid}.self_attn.pos_bias_u", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_POS_BIAS_V: (
+            "conformer.layers.{bid}.self_attn.pos_bias_v", # lfm2
+        ),
+
+        MODEL_TENSOR.A_ENC_OUT: (
+            "conformer.pre_encode.out", # lfm2
         ),

         # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
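Note: each tuple above maps one GGUF tensor to the HF checkpoint names it can come from, with `{bid}` expanded per block. Conceptually (a simplified sketch, not the actual gguf-py API):

```python
# One entry from the tables above, spelled out:
hf_template   = "conformer.layers.{bid}.norm_feed_forward1"  # lfm2 source name
gguf_template = "a.blk.{bid}.ffn_norm"                       # MODEL_TENSOR.A_ENC_FFN_NORM

def to_gguf(hf_name: str, n_blocks: int = 64) -> str | None:
    # the real TensorNameMap pre-expands these templates into a lookup dict
    for bid in range(n_blocks):
        if hf_name == hf_template.format(bid=bid):
            return gguf_template.format(bid=bid)
    return None

assert to_gguf("conformer.layers.7.norm_feed_forward1") == "a.blk.7.ffn_norm"
```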
@@ -1587,6 +1645,7 @@ class TensorNameMap:

         MODEL_TENSOR.A_MMPROJ: (
             "audio.multi_modal_projector.linear_{bid}", # ultravox
+            "audio_adapter.model.{bid}" # lfm2
         ),

         MODEL_TENSOR.A_MMPROJ_FC: (
@@ -7295,11 +7295,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));

-    for (int64_t d_conv : {3, 4}) {
+    for (int64_t d_conv : {3, 4, 9}) {
         for (int64_t d_inner: {1024, 1536, 2048}) {
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 4, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {2 * d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
+            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 4, 1}, {d_conv, d_inner, 1, 1}));
         }
     }
@@ -15,6 +15,7 @@ add_library(mtmd
     clip-graph.h
     models/models.h
     models/cogvlm.cpp
+    models/conformer.cpp
     models/glm4v.cpp
     models/internvl.cpp
     models/kimivl.cpp
@@ -138,6 +138,17 @@
 #define TN_TOK_BOI "v.boi"
 #define TN_TOK_EOI "v.eoi"

+// (conformer) lfm2
+#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
+#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
+#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
+#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
+#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
+#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
+#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
+#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
+#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@@ -170,6 +181,7 @@ enum projector_type {
     PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_COGVLM,
     PROJECTOR_TYPE_JANUS_PRO,
+    PROJECTOR_TYPE_LFM2A,
     PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -198,6 +210,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
     { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
     { PROJECTOR_TYPE_GLM4V,     "glm4v"},
 };
@@ -4,6 +4,7 @@
 #include "clip.h"
 #include "clip-impl.h"

+#include <array>
 #include <vector>
 #include <unordered_set>
 #include <cstdint>
@@ -142,6 +143,30 @@ struct clip_layer {
     ggml_tensor * deepstack_fc2_w = nullptr;
     ggml_tensor * deepstack_fc2_b = nullptr;
+
+    // lfm2
+    ggml_tensor * ff_norm_w = nullptr;
+    ggml_tensor * ff_norm_b = nullptr;
+    ggml_tensor * ff_norm_1_w = nullptr;
+    ggml_tensor * ff_norm_1_b = nullptr;
+    ggml_tensor * ff_up_1_w = nullptr;
+    ggml_tensor * ff_up_1_b = nullptr;
+    ggml_tensor * ff_down_1_w = nullptr;
+    ggml_tensor * ff_down_1_b = nullptr;
+    ggml_tensor * pos_bias_u = nullptr;
+    ggml_tensor * pos_bias_v = nullptr;
+    ggml_tensor * norm_conv_w = nullptr;
+    ggml_tensor * norm_conv_b = nullptr;
+    ggml_tensor * linear_pos_w = nullptr;
+
+    ggml_tensor * conv_norm_w = nullptr;
+    ggml_tensor * conv_norm_b = nullptr;
+    ggml_tensor * conv_dw_w = nullptr;
+    ggml_tensor * conv_dw_b = nullptr;
+    ggml_tensor * conv_pw1_w = nullptr;
+    ggml_tensor * conv_pw1_b = nullptr;
+    ggml_tensor * conv_pw2_w = nullptr;
+    ggml_tensor * conv_pw2_b = nullptr;

     bool has_deepstack() const {
         return deepstack_fc1_w != nullptr;
     }
@@ -286,6 +311,12 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;

+    // lfm2
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
+    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
+    ggml_tensor * pre_encode_out_w = nullptr;
+    ggml_tensor * pre_encode_out_b = nullptr;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL;
@@ -837,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_llava>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                builder = std::make_unique<clip_graph_conformer>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
@@ -1187,6 +1191,15 @@ struct clip_model_loader {
                     hparams.audio_window_len = 400;
                     hparams.audio_hop_len = 160;
                 } break;
+            case PROJECTOR_TYPE_LFM2A:
+                {
+                    // audio preprocessing params
+                    hparams.audio_chunk_len = 1; // in seconds
+                    hparams.audio_sample_rate = 16000;
+                    hparams.audio_n_fft = 512;
+                    hparams.audio_window_len = 400;
+                    hparams.audio_hop_len = 160;
+                } break;
             default:
                 break;
         }
@@ -1611,6 +1624,52 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                 } break;
+            case PROJECTOR_TYPE_LFM2A:
+                {
+                    for (int i : {0, 2, 3, 5, 6}) {
+                        model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
+                        model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
+                    }
+                    model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
+                    model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
+
+                    model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
+                    model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
+
+                    for (int il = 0; il < hparams.n_layer; ++il) {
+                        auto & layer = model.layers[il];
+
+                        layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
+                        layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
+                        layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
+                        layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
+                        layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
+                        layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
+                        layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
+                        layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
+
+                        layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
+                        layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
+
+                        layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
+                        layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
+
+                        layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
+
+                        layer.conv_norm_w = get_tensor(string_format("convnext.%d.norm.%s", il, "weight"));
+                        layer.conv_norm_b = get_tensor(string_format("convnext.%d.norm.%s", il, "bias"));
+                        layer.conv_dw_w = get_tensor(string_format("convnext.%d.dw.%s", il, "weight"));
+                        layer.conv_dw_b = get_tensor(string_format("convnext.%d.dw.%s", il, "bias"));
+                        layer.conv_pw1_w = get_tensor(string_format("convnext.%d.pw1.%s", il, "weight"));
+                        layer.conv_pw1_b = get_tensor(string_format("convnext.%d.pw1.%s", il, "bias"));
+                        layer.conv_pw2_w = get_tensor(string_format("convnext.%d.pw2.%s", il, "weight"));
+                        layer.conv_pw2_b = get_tensor(string_format("convnext.%d.pw2.%s", il, "bias"));
+                    }
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -3004,6 +3063,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 n_patches += 2; // for BOI and EOI token embeddings
             } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }
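Note: the `n_patches` expression for `PROJECTOR_TYPE_LFM2A` mirrors the three stride-2 convolutions in `conformer.pre_encode`, each of which ceil-halves the time axis. The same arithmetic, spelled out (illustrative sketch only):

```python
def lfm2a_n_tokens(n_frames: int) -> int:
    # ((((nx + 1) / 2) + 1) / 2 + 1) / 2 in C integer arithmetic:
    # three rounds of ceil-division by 2, one per stride-2 conv.
    for _ in range(3):
        n_frames = (n_frames + 1) // 2
    return n_frames

# e.g. 100 mel frames (1 s of audio at the 10 ms hop set above) -> 13 tokens
assert lfm2a_n_tokens(100) == 13
```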
@@ -3362,6 +3425,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("pos_w", pos_data);
             } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
+
+                auto d_model = 512;
+                auto seq_len = n_frames * 2 - 1;
+                std::vector<float> pos_emb(d_model*seq_len);
+                std::vector<double> inv_freq(d_model / 2);
+                for (size_t i = 0; i < inv_freq.size(); ++i) {
+                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
+                }
+                for (int64_t pos = 0; pos < seq_len; ++pos) {
+                    for (size_t i = 0; i < inv_freq.size(); ++i) {
+                        const float ang = (n_frames - pos - 1) * inv_freq[i];
+                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
+                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
+                    }
+                }
+                set_input_f32("pos_emb", pos_emb);
+            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
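Note: the `PROJECTOR_TYPE_LFM2A` branch above fills a sinusoidal *relative* position table on the CPU and feeds it in as a graph input; positions run from `n_frames - 1` down to `-(n_frames - 1)`, hence `seq_len = 2 * n_frames - 1`. A direct transliteration of the loop (with `d_model = 512` hard-coded, as in the patch):

```python
import math

def lfm2a_pos_emb(n_frames: int, d_model: int = 512) -> list[list[float]]:
    seq_len = 2 * n_frames - 1
    inv_freq = [math.exp(-(math.log(10000.0) / d_model) * (2 * i))
                for i in range(d_model // 2)]
    table = [[0.0] * d_model for _ in range(seq_len)]
    for pos in range(seq_len):
        for i, f in enumerate(inv_freq):
            ang = (n_frames - pos - 1) * f  # relative distance, +max .. -max
            table[pos][2 * i]     = math.sin(ang)  # even dims
            table[pos][2 * i + 1] = math.cos(ang)  # odd dims
    return table
```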
@@ -3456,6 +3540,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
+        case PROJECTOR_TYPE_LFM2A:
+            return ctx->model.position_embeddings->ne[0];
         case PROJECTOR_TYPE_GLM4V:
             return ctx->model.mm_ffn_down_w->ne[1];
         default:
@@ -0,0 +1,222 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_conformer::build() {
+    const int n_frames = img.nx;
+    const int n_pos = n_frames / 2;
+    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
+    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
+    ggml_set_name(pos_emb, "pos_emb");
+    ggml_set_input(pos_emb);
+    ggml_build_forward_expand(gf, pos_emb);
+
+    ggml_tensor * inp = build_inp_raw(1);
+    cb(inp, "input", -1);
+
+    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+
+    // pre encode, conv subsampling
+    {
+        // layer.0 - conv2d
+        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
+        cb(cur, "conformer.pre_encode.conv.{}", 0);
+
+        // layer.1 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.2 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
+        cb(cur, "conformer.pre_encode.conv.{}", 2);
+
+        // layer.3 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
+        cb(cur, "conformer.pre_encode.conv.{}", 3);
+
+        // layer.4 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // layer.5 conv2d dw
+        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
+        cb(cur, "conformer.pre_encode.conv.{}", 5);
+
+        // layer.6 conv2d
+        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
+        cb(cur, "conformer.pre_encode.conv.{}", 6);
+
+        // layer.7 - relu
+        cur = ggml_relu_inplace(ctx0, cur);
+
+        // flatten channel and frequency axis
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
+
+        // calculate out
+        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
+        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
+        cb(cur, "conformer.pre_encode.out", -1);
+    }
+
+    // pos_emb
+    cb(pos_emb, "pos_emb", -1);
+
+    for (int il = 0; il < hparams.n_layer; il++) {
+        const auto & layer = model.layers[il];
+
+        auto * residual = cur;
+
+        cb(cur, "layer.in", il);
+
+        // feed_forward1
+        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
+
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+                        il);
+        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
+
+        const auto fc_factor = 0.5f;
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+
+        // self-attention
+        {
+            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+            cb(cur, "conformer.layers.{}.norm_self_att", il);
+
+            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+            Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
+
+            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
+            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
+
+            // build_attn won't fit due to matrix_ac and matrix_bd separation
+            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
+            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
+
+            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
+            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+            p = ggml_permute(ctx0, p, 0, 2, 1, 3);
+
+            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
+            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
+
+            // rel shift
+            {
+                const auto pos_len = matrix_bd->ne[0];
+                const auto q_len = matrix_bd->ne[1];
+                const auto h = matrix_bd->ne[2];
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
+                matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
+            }
+
+            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
+            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
+            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            cb(scores, "conformer.layers.{}.self_attn.id0", il);
+
+            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
+            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
+            x = ggml_permute(ctx0, x, 2, 0, 1, 3);
+            x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+
+            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
+            out = ggml_add(ctx0, out, layer.o_b);
+            cb(out, "conformer.layers.{}.self_attn.linear_out", il);
+
+            cur = out;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_conv", il);
+
+        // conv
+        {
+            auto * x = cur;
+            auto * conv_pw1_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw1_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw1_b);
+            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
+
+            // ggml_glu doesn't support sigmoid
+            // TODO @ngxson : support this ops in ggml
+            {
+                int64_t d = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            }
+
+            // use ggml_ssm_conv for f32 precision
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
+            x = ggml_ssm_conv(ctx0, x, conv_dw_w);
+            x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
+
+            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
+            x = ggml_silu(ctx0, x);
+
+            // pointwise_conv2
+            auto * conv_pw2_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            x = ggml_mul_mat(ctx0, conv_pw2_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw2_b);
+
+            cur = x;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
+
+        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
+                        FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
+
+        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        cb(residual, "conformer.layers.{}.conv.id", il);
+
+        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cb(cur, "conformer.layers.{}.norm_out", il);
+    }
+
+    // audio adapter
+    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+    cb(cur, "audio_adapter.model.{}", 0);
+    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
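Note: the `// rel shift` block in `build()` above is the Transformer-XL style relative shift, implemented with `ggml_pad`/`ggml_roll`/reshape because ggml has no dedicated op for it. For comparison, a common reference formulation per attention head (a sketch, assuming `torch`; the ggml tensors above use a different axis order):

```python
import torch

def rel_shift(x: torch.Tensor) -> torch.Tensor:
    # x: (q_len, pos_len) scores of one head against relative positions,
    # with pos_len = 2 * q_len - 1.
    q, p = x.shape
    x = torch.nn.functional.pad(x, (1, 0))  # (q, p + 1): one zero column
    x = x.view(p + 1, q)                    # fold so each row shifts by one
    x = x[1:].view(q, p)                    # drop the pad, restore shape
    return x[:, : (p + 1) // 2]             # keep the q meaningful columns
```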
@@ -57,6 +57,11 @@ struct clip_graph_whisper_enc : clip_graph {
     ggml_cgraph * build() override;
 };

+struct clip_graph_conformer : clip_graph {
+    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_glm4v : clip_graph {
     clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
@@ -535,3 +535,56 @@ bool mtmd_audio_preprocessor_whisper::preprocess(

     return true;
 }
+
+//
+// mtmd_audio_preprocessor_conformer
+//
+
+void mtmd_audio_preprocessor_conformer::initialize() {
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate);
+}
+
+bool mtmd_audio_preprocessor_conformer::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
+    // empty audio
+    if (n_samples == 0) {
+        return false;
+    }
+
+    filter_params params;
+    params.n_mel = hparams.n_mel_bins;
+    params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
+    params.hann_window_size = hparams.audio_window_len;
+    params.hop_length = hparams.audio_hop_len;
+    params.sample_rate = hparams.audio_sample_rate;
+    params.center_padding = true;
+    params.preemph = 0.97f; // disabled
+    params.use_natural_log = true;
+    params.norm_per_feature = true;
+
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());
+
+    mtmd_audio_mel out_full;
+    bool ok = log_mel_spectrogram(
+        samples,
+        n_samples,
+        4, // n_threads
+        params,
+        out_full);
+    if (!ok) {
+        return false;
+    }
+
+    output.push_back(std::move(out_full));
+    return true;
+}
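Note: the conformer front-end reuses the shared mel pipeline with the options set above (centered STFT, natural-log mel, per-feature normalization). Rough shape arithmetic for those parameters (illustrative):

```python
sample_rate = 16000  # hparams.audio_sample_rate
n_fft       = 512    # hparams.audio_n_fft
hop         = 160    # hparams.audio_hop_len -> 10 ms, i.e. 100 frames/s
win         = 400    # hparams.audio_window_len -> 25 ms Hann window

n_fft_bins = 1 + n_fft // 2  # 257 linear-frequency bins per frame
assert n_fft_bins == 257
assert sample_rate // hop == 100
```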
@@ -32,3 +32,9 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
     void initialize() override;
     bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
 };
+
+struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+    void initialize() override;
+    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+};
@@ -309,9 +309,22 @@ int main(int argc, char ** argv) {

     if (g_is_interrupted) return 130;

+    auto eval_system_prompt_if_present = [&] {
+        if (params.system_prompt.empty()) {
+            return 0;
+        }
+
+        common_chat_msg msg;
+        msg.role = "system";
+        msg.content = params.system_prompt;
+        return eval_message(ctx, msg);
+    };
+
     LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
     LOG_WRN("      For normal use cases, please use the standard llama-cli\n");

+    eval_system_prompt_if_present();
+
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||||
params.prompt = mtmd_default_marker() + params.prompt;
|
params.prompt = mtmd_default_marker() + params.prompt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
common_chat_msg msg;
|
common_chat_msg msg;
|
||||||
msg.role = "user";
|
msg.role = "user";
|
||||||
msg.content = params.prompt;
|
msg.content = params.prompt;
|
||||||
|
|
@ -370,6 +384,7 @@ int main(int argc, char ** argv) {
|
||||||
ctx.chat_history.clear();
|
ctx.chat_history.clear();
|
||||||
llama_memory_clear(llama_get_memory(ctx.lctx), true);
|
llama_memory_clear(llama_get_memory(ctx.lctx), true);
|
||||||
LOG("Chat history cleared\n\n");
|
LOG("Chat history cleared\n\n");
|
||||||
|
eval_system_prompt_if_present();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
g_is_generating = true;
|
g_is_generating = true;
|
||||||
|
|
|
||||||
|
|
@@ -331,6 +331,9 @@ struct mtmd_context {
         case PROJECTOR_TYPE_VOXTRAL:
             audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
             break;
+        case PROJECTOR_TYPE_LFM2A:
+            audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
+            break;
         default:
             GGML_ABORT("unsupported audio projector type");
     }