mtmd: add Gemma 4 audio conformer encoder support (#21421)
* mtmd: add Gemma 4 audio conformer encoder support

Add audio processing for Gemma 4 E2B/E4B via a USM-style Conformer.

Architecture:
- 12-layer Conformer: FFN → Self-Attention → Causal Conv1D → FFN → Norm
- Subsampling Conv Projection: 2x Conv2D(stride=2) with LayerNorm
- Full self-attention with sinusoidal RPE and sliding window mask (24)
- Logit softcapping at 50.0, ClippableLinear clamping
- Output: 1024 → 1536 → RMSNorm → multimodal embedder

Mel preprocessing (dedicated mtmd_audio_preprocessor_gemma4a):
- HTK mel scale, 128 bins, magnitude STFT, mel_floor=1e-3
- Standard periodic Hann window (320 samples), zero-padded to FFT size
- Semicausal left-padding (frame_length/2 samples)
- Frame count matched to PyTorch (unfold formula)
- No pre-emphasis, no Whisper-style normalization
- Mel cosine similarity vs PyTorch: 0.9998

Key fixes:
- Tensor loading dedup: prevent get_tensor() from creating duplicate entries in ctx_data. Fixed with a std::unordered_set guard.
- ClippableLinear clamp_info loading moved after per-layer tensors.
- Sliding window mask (24 positions) matching PyTorch context_size.
- Skip Whisper normalization for Gemma4 mel output.

Tested on E2B and E4B with CPU and Vulkan backends. Transcribes: "Glad to see things are going well and business is starting to pick up" (matching ground truth).

Ref: #21325
This commit is contained in:
parent
9e209c5aee
commit
547765a93e
|
|
@ -134,8 +134,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
|
|||
switch (nc) {
|
||||
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
|
||||
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
|
||||
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
|
||||
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
|
||||
default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
|
||||
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -88,6 +88,11 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|||
uint32_t n_layer = 2;
|
||||
if (arch == LLM_ARCH_LLAMA4) {
|
||||
n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
|
||||
} else if (arch == LLM_ARCH_GEMMA4) {
|
||||
n_embd = 128;
|
||||
n_head = 2;
|
||||
n_ff = 192;
|
||||
n_layer = 5; // need at least 5 for swa_pattern (every 5th is full_attention)
|
||||
} else if (arch == LLM_ARCH_GEMMA3N) {
|
||||
n_embd = 64;
|
||||
n_head = 1;
|
||||
|
|
@ -169,7 +174,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|||
ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
|
||||
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8);
|
||||
|
||||
if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
|
||||
if (arch == LLM_ARCH_GEMMA4) {
|
||||
ms.add_kv(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, n_embd/2);
|
||||
ms.add_kv(LLM_KV_ATTENTION_SHARED_KV_LAYERS, uint32_t(0));
|
||||
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, n_embd_head);
|
||||
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, n_embd_head);
|
||||
ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, 10000.0f);
|
||||
// SWA pattern: every 5th layer is full attention (matches E2B layer_types)
|
||||
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5));
|
||||
} else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
|
||||
std::vector<uint32_t> pattern;
|
||||
pattern.reserve(n_layer);
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
|
|
@ -429,6 +442,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
|||
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
||||
continue;
|
||||
}
|
||||
if (arch == LLM_ARCH_GEMMA4) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
|
|
@ -510,6 +526,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
|||
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
||||
continue;
|
||||
}
|
||||
if (arch == LLM_ARCH_GEMMA4) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
|
||||
const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
|
||||
for (bool moe : {false, true}) {
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ add_library(mtmd
|
|||
models/cogvlm.cpp
|
||||
models/conformer.cpp
|
||||
models/dotsocr.cpp
|
||||
models/gemma4a.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/hunyuanocr.cpp
|
||||
|
|
|
|||
|
|
@ -181,6 +181,21 @@
|
|||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
|
||||
// gemma4 audio conformer
|
||||
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
|
||||
#define TN_A_MM_SOFT_EMB_N "mm.a.soft_emb_norm.%s"
|
||||
#define TN_A_INP_PROJ "a.input_projection.%s"
|
||||
#define TN_A_CONV1D "a.conv1d.%d.%s"
|
||||
#define TN_A_CONV1D_NORM "a.conv1d.%d.norm.%s"
|
||||
#define TN_A_OUT_PROJ "a.pre_encode.out.%s"
|
||||
#define TN_A_ATTN_PRE_NORM "%s.blk.%d.attn_pre_norm.%s"
|
||||
#define TN_A_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s"
|
||||
#define TN_A_ATTN_K_REL "%s.blk.%d.attn_k_rel.%s"
|
||||
#define TN_A_PER_DIM_SCALE "%s.blk.%d.per_dim_scale.%s"
|
||||
#define TN_A_PER_DIM_K_SCALE "%s.blk.%d.per_dim_k_scale.%s"
|
||||
#define TN_A_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s"
|
||||
#define TN_A_FFN_POST_NORM_1 "%s.blk.%d.ffn_post_norm_1.%s"
|
||||
|
||||
// mobilenetv5 (gemma3n) definitions
|
||||
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
|
||||
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
|
||||
|
|
|
|||
|
|
@ -217,6 +217,13 @@ struct clip_layer {
|
|||
ggml_tensor * conv_pw2_w = nullptr;
|
||||
ggml_tensor * conv_pw2_b = nullptr;
|
||||
|
||||
// gemma4 audio conformer per-layer
|
||||
ggml_tensor * attn_pre_norm_w = nullptr;
|
||||
ggml_tensor * attn_k_rel_w = nullptr;
|
||||
ggml_tensor * per_dim_scale_w = nullptr;
|
||||
ggml_tensor * per_dim_k_scale_w = nullptr;
|
||||
ggml_tensor * ff_post_norm_1_w = nullptr;
|
||||
|
||||
bool has_deepstack() const {
|
||||
return deepstack_fc1_w != nullptr;
|
||||
}
|
||||
|
|
@ -459,6 +466,15 @@ struct clip_model {
|
|||
};
|
||||
std::map<std::string, clamp_info> clamp_info_map;
|
||||
|
||||
// gemma4 audio conformer
|
||||
std::array<ggml_tensor *, 2> sscp_conv_w = {nullptr};
|
||||
std::array<ggml_tensor *, 2> sscp_conv_b = {nullptr};
|
||||
std::array<ggml_tensor *, 2> sscp_norm_w = {nullptr};
|
||||
ggml_tensor * sscp_inp_proj_w = nullptr;
|
||||
ggml_tensor * sscp_inp_proj_b = nullptr;
|
||||
ggml_tensor * audio_out_proj_w = nullptr;
|
||||
ggml_tensor * audio_out_proj_b = nullptr;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||
|
|
|
|||
|
|
@ -931,6 +931,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_conformer>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_gemma4a>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||
|
|
@ -1459,6 +1463,16 @@ struct clip_model_loader {
|
|||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
// Gemma4 feature_extraction_gemma4.py:
|
||||
// frame_length_ms=20 -> 320 samples, n_fft=512, hop=10ms -> 160
|
||||
hparams.audio_chunk_len = 0; // no fixed-length padding
|
||||
hparams.audio_sample_rate = 16000;
|
||||
hparams.audio_n_fft = 512;
|
||||
hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400)
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
hparams.image_pad_color = {127, 127, 127};
|
||||
|
|
@ -1561,16 +1575,21 @@ struct clip_model_loader {
|
|||
}
|
||||
|
||||
// helper function
|
||||
std::unordered_set<std::string> loaded_tensor_names;
|
||||
auto get_tensor = [&](const std::string & name, bool required = true) {
|
||||
// Each tensor should only be loaded once; duplicates indicate a bug
|
||||
if (loaded_tensor_names.count(name)) {
|
||||
throw std::runtime_error(string_format("%s: tensor already loaded: %s\n", __func__, name.c_str()));
|
||||
}
|
||||
ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
|
||||
if (!cur && required) {
|
||||
throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
|
||||
}
|
||||
if (cur) {
|
||||
tensors_to_load.push_back(cur);
|
||||
// add tensors to context
|
||||
ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
|
||||
ggml_set_name(data_tensor, cur->name);
|
||||
loaded_tensor_names.insert(name);
|
||||
cur = data_tensor;
|
||||
}
|
||||
return cur;
|
||||
|
|
@ -2186,6 +2205,76 @@ struct clip_model_loader {
|
|||
model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
for (int i = 0; i < 2; i++) {
|
||||
model.sscp_conv_w[i] = get_tensor(string_format(TN_A_CONV1D, i, "weight"));
|
||||
model.sscp_conv_b[i] = get_tensor(string_format(TN_A_CONV1D, i, "bias"), false);
|
||||
model.sscp_norm_w[i] = get_tensor(string_format(TN_A_CONV1D_NORM, i, "weight"), false);
|
||||
}
|
||||
model.sscp_inp_proj_w = get_tensor(string_format(TN_A_INP_PROJ, "weight"));
|
||||
model.sscp_inp_proj_b = get_tensor(string_format(TN_A_INP_PROJ, "bias"), false);
|
||||
model.audio_out_proj_w = get_tensor(string_format(TN_A_OUT_PROJ, "weight"), false);
|
||||
model.audio_out_proj_b = get_tensor(string_format(TN_A_OUT_PROJ, "bias"), false);
|
||||
// audio multimodal embedder (mm.a.* namespace, not mm.*)
|
||||
model.mm_soft_emb_norm_w = get_tensor(string_format(TN_A_MM_SOFT_EMB_N, "weight"), false);
|
||||
model.mm_input_proj_w = get_tensor(string_format(TN_A_MM_INP_PROJ, "weight"), false);
|
||||
|
||||
// Per-layer tensors NOT loaded by the generic loop above
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = model.layers[il];
|
||||
|
||||
// Gemma4 audio conformer-specific tensors
|
||||
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
|
||||
layer.attn_pre_norm_w = get_tensor(string_format(TN_A_ATTN_PRE_NORM, prefix, il, "weight"), false);
|
||||
layer.per_dim_scale_w = get_tensor(string_format(TN_A_PER_DIM_SCALE, prefix, il, "weight"), false);
|
||||
layer.per_dim_k_scale_w = get_tensor(string_format(TN_A_PER_DIM_K_SCALE, prefix, il, "weight"), false);
|
||||
layer.attn_k_rel_w = get_tensor(string_format(TN_A_ATTN_K_REL, prefix, il, "weight"), false);
|
||||
|
||||
// Convolution module
|
||||
// Note: conv_norm / norm_conv are swapped in GGUF due to
|
||||
// upstream tensor_mapping.py, so we load them in reverse order
|
||||
layer.norm_conv_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false);
|
||||
layer.norm_conv_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false);
|
||||
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
|
||||
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"), false);
|
||||
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
|
||||
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"), false);
|
||||
layer.conv_norm_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false);
|
||||
layer.conv_norm_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false);
|
||||
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
|
||||
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"), false);
|
||||
|
||||
// FFN2 (second half-step)
|
||||
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
|
||||
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
|
||||
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"), false);
|
||||
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
|
||||
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"), false);
|
||||
layer.ff_post_norm_1_w = get_tensor(string_format(TN_A_FFN_POST_NORM_1, prefix, il, "weight"), false);
|
||||
}
|
||||
|
||||
// Load clamp info for ClippableLinear AFTER all tensors are loaded
|
||||
for (auto * tensor : tensors_to_load) {
|
||||
std::string name = tensor->name;
|
||||
if (string_ends_with(name, ".weight")) {
|
||||
std::string name_inp_max = name;
|
||||
std::string name_inp_min = name;
|
||||
std::string name_out_max = name;
|
||||
std::string name_out_min = name;
|
||||
string_replace_all(name_inp_max, ".weight", ".input_max");
|
||||
string_replace_all(name_inp_min, ".weight", ".input_min");
|
||||
string_replace_all(name_out_max, ".weight", ".output_max");
|
||||
string_replace_all(name_out_min, ".weight", ".output_min");
|
||||
model.clamp_info_map[name] = {
|
||||
get_scalar(name_inp_max, FLT_MAX),
|
||||
get_scalar(name_inp_min, -FLT_MAX),
|
||||
get_scalar(name_out_max, FLT_MAX),
|
||||
get_scalar(name_out_min, -FLT_MAX)
|
||||
};
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
for (int i : {0, 2, 3, 5, 6}) {
|
||||
|
|
@ -2246,7 +2335,10 @@ struct clip_model_loader {
|
|||
ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
for (auto & t : tensors_to_load) {
|
||||
ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
|
||||
const size_t offset = tensor_offset[t->name];
|
||||
GGML_ASSERT(cur && "tensor not found in ctx_data");
|
||||
auto it_off = tensor_offset.find(t->name);
|
||||
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
|
||||
const size_t offset = it_off->second;
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
|
||||
|
|
@ -2266,6 +2358,7 @@ struct clip_model_loader {
|
|||
|
||||
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
struct support_info_op {
|
||||
|
|
@ -2538,8 +2631,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|
||||
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
|
|
@ -2893,6 +2985,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
{
|
||||
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
// Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
|
||||
// O = floor((I - 1) / 2) + 1
|
||||
int n = img->nx;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
n = (n - 1) / 2 + 1;
|
||||
}
|
||||
n_patches = n;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported projector type");
|
||||
}
|
||||
|
|
@ -3352,6 +3454,56 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
set_input_i32("pos_w", pos_data);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
GGML_ASSERT(imgs.entries.size() == 1);
|
||||
const auto & img0 = imgs.entries.front();
|
||||
// Compute n_pos matching SSCP output: two stride-2 convs
|
||||
int n_pos = img0->nx;
|
||||
for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }
|
||||
|
||||
// Chunked local attention: blocked causal mask and RPE
|
||||
const int chunk_size = 12;
|
||||
const int max_past = 12;
|
||||
const int context_size = chunk_size + max_past;
|
||||
const int num_blocks = (n_pos + chunk_size - 1) / chunk_size;
|
||||
|
||||
// Blocked causal attention mask: [context_size, chunk_size, num_blocks]
|
||||
{
|
||||
std::vector<float> mask(context_size * chunk_size * num_blocks, -1e9f);
|
||||
for (int b = 0; b < num_blocks; b++) {
|
||||
for (int q = 0; q < chunk_size; q++) {
|
||||
int gq = b * chunk_size + q;
|
||||
for (int k = 0; k < context_size; k++) {
|
||||
int gk = b * chunk_size - max_past + k;
|
||||
if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq && (gq - gk) < max_past) {
|
||||
mask[k + q * context_size + b * context_size * chunk_size] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
set_input_f32("kq_mask", mask);
|
||||
}
|
||||
|
||||
// Sinusoidal RPE: 13 positions [12, 11, ..., 0]
|
||||
{
|
||||
const int n_embd = ctx->model.hparams.n_embd;
|
||||
const int num_timescales = n_embd / 2;
|
||||
const float log_timescale_increment = logf(10000.0f) / std::max(num_timescales - 1, 1);
|
||||
const int rpe_len = max_past + 1;
|
||||
std::vector<float> pos_emb(n_embd * rpe_len, 0.0f);
|
||||
for (int p = 0; p < rpe_len; p++) {
|
||||
float position = (float)(max_past - p);
|
||||
for (int i = 0; i < num_timescales; i++) {
|
||||
float inv_ts = expf(-(float)i * log_timescale_increment);
|
||||
float scaled = position * inv_ts;
|
||||
pos_emb[p * n_embd + i] = sinf(scaled);
|
||||
pos_emb[p * n_embd + i + num_timescales] = cosf(scaled);
|
||||
}
|
||||
}
|
||||
set_input_f32("pos_emb", pos_emb);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
GGML_ASSERT(imgs.entries.size() == 1);
|
||||
|
|
@ -3516,6 +3668,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
return ctx->model.position_embeddings->ne[0];
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
return ctx->model.hparams.projection_dim;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return ctx->model.mm_ffn_down_w->ne[1];
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,288 @@
|
|||
/**
|
||||
* Gemma 4 Audio Conformer Encoder (clip_graph_gemma4a)
|
||||
*
|
||||
* Architecture: Conformer with dual half-step FFN, chunked local self-attention
|
||||
* with sinusoidal RPE, depthwise light conv, and output projection.
|
||||
*/
|
||||
|
||||
#include "models.h"
|
||||
#include <cmath>
|
||||
|
||||
// Builds the compute graph for the Gemma 4 audio encoder.
// Stages, in order: raw input -> two stride-2 Conv2D subsampling steps ->
// input projection -> hparams.n_layer conformer blocks (FFN half-step,
// chunked local attention with relative position term, convolution module,
// FFN half-step, output norm) -> optional output projection -> RMS norm and
// multimodal embedder projection. Most stages are guarded by null checks on
// their weights and are skipped when the corresponding tensor is absent.
// Returns the graph `gf` with the final tensor expanded into it.
ggml_cgraph * clip_graph_gemma4a::build() {
|
||||
// scale applied to each FFN branch before it is added to the residual
const float res_weight = 0.5f;
|
||||
// epsilon shared by every norm in this graph
const float norm_eps = 1e-6f;
|
||||
|
||||
// 1. Input
|
||||
ggml_tensor * inp = build_inp_raw(1);
|
||||
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
|
||||
// 2. Subsampling Conv2D (symmetric padding=1, matching PyTorch)
|
||||
{
|
||||
for (int i = 0; i < 2; i++) {
|
||||
cur = ggml_conv_2d(ctx0, model.sscp_conv_w[i], cur, 2, 2, 1, 1, 1, 1);
|
||||
if (model.sscp_conv_b[i]) {
|
||||
cur = ggml_add(ctx0, cur, model.sscp_conv_b[i]);
|
||||
}
|
||||
// nn.LayerNorm(channels): permute ch to ne[0], normalize, permute back
|
||||
if (model.sscp_norm_w[i]) {
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = ggml_norm(ctx0, cur, norm_eps);
|
||||
cur = ggml_mul(ctx0, cur, model.sscp_norm_w[i]);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
}
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
}
|
||||
// Flatten [freq, time, ch, 1] -> [ch*freq, time]
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
|
||||
if (model.sscp_inp_proj_w) {
|
||||
cur = build_mm(model.sscp_inp_proj_w, cur);
|
||||
if (model.sscp_inp_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.sscp_inp_proj_b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sequence length after subsampling; all blocking below is derived from it
const int64_t n_pos = cur->ne[1];
|
||||
|
||||
// Chunked local attention parameters
|
||||
const int64_t C = 12; // chunk_size
|
||||
const int64_t P = 12; // max_past_horizon (context_left - 1)
|
||||
const int64_t S = C + P; // context_size = 24
|
||||
const int64_t R = P + 1; // RPE positions = 13
|
||||
const int64_t B = (n_pos + C - 1) / C; // num_blocks
|
||||
const int64_t Np = B * C; // padded sequence length
|
||||
const int64_t pad_seq = Np - n_pos;
|
||||
|
||||
// Input tensors: blocked RPE and blocked attention mask
|
||||
// Both are named graph inputs created once and reused by every layer.
// NOTE(review): presumably filled by name at encode time ("pos_emb" / "kq_mask") — confirm against the encode path.
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_head * d_head, R);
|
||||
ggml_set_name(pos_emb, "pos_emb");
|
||||
ggml_set_input(pos_emb);
|
||||
|
||||
ggml_tensor * kq_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, S, C, B);
|
||||
ggml_set_name(kq_mask, "kq_mask");
|
||||
ggml_set_input(kq_mask);
|
||||
|
||||
// 3. Conformer Blocks
|
||||
for (int il = 0; il < hparams.n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
// `residual` accumulates the block's running sum; `cur` is the working branch
auto * residual = cur;
|
||||
|
||||
// FFN 1 (half-step)
|
||||
if (layer.ff_norm_w && layer.ff_up_w && layer.ff_down_w) {
|
||||
cur = build_norm(cur, layer.ff_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, nullptr, nullptr, nullptr,
|
||||
layer.ff_down_w, nullptr, FFN_SILU, il);
|
||||
if (layer.ff_post_norm_w) {
|
||||
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight));
|
||||
}
|
||||
|
||||
// Chunked local self-attention with RPE
|
||||
if (layer.q_w && layer.k_w && layer.v_w && layer.o_w) {
|
||||
// q_scale = d_head^-0.5 / ln(2); k_scale = ln(1 + e) / ln(2), i.e. softplus(1) / ln(2)
const float q_scale = (1.0f / sqrtf((float)d_head)) / logf(2.0f);
|
||||
const float k_scale = logf(1.0f + expf(1.0f)) / logf(2.0f);
|
||||
const float softcap = 50.0f;
|
||||
|
||||
// prefer the dedicated pre-attention norm, fall back to ln_1_w, else no norm
ggml_tensor * attn_norm_w = layer.attn_pre_norm_w ? layer.attn_pre_norm_w : layer.ln_1_w;
|
||||
cur = attn_norm_w
|
||||
? build_norm(residual, attn_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il)
|
||||
: residual;
|
||||
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
|
||||
// [n_embd, n_pos] -> [D, H, N]
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
|
||||
// Q/K scaling
|
||||
Qcur = ggml_scale(ctx0, Qcur, q_scale);
|
||||
if (layer.per_dim_scale_w) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, ggml_reshape_3d(ctx0, layer.per_dim_scale_w, d_head, 1, 1));
|
||||
}
|
||||
Kcur = ggml_scale(ctx0, Kcur, k_scale);
|
||||
if (layer.per_dim_k_scale_w) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.per_dim_k_scale_w, d_head, 1, 1));
|
||||
}
|
||||
|
||||
// Q blocking: [D, H, N] -> pad to Np -> reshape [D, H, C, B]
|
||||
// ggml permute: ne[ax_i] = src->ne[i], so (0,3,1,2) sends H->3, C->1, B->2
|
||||
Qcur = ggml_pad(ctx0, Qcur, 0, 0, pad_seq, 0); // [D, H, Np]
|
||||
Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, C, B); // [D, H, C, B]
|
||||
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 3, 1, 2)); // [D, C, B, H]
|
||||
|
||||
// K/V block context extraction via overlapping view:
|
||||
// Pad to S*B elements, roll right by P to create left-padding,
|
||||
// then view with stride C in the block dimension (overlapping windows).
|
||||
auto extract_blocks = [&](ggml_tensor * t) -> ggml_tensor * {
|
||||
// [D, H, N] -> pad to S*B -> roll right by P -> cont (materialize)
|
||||
const int64_t pad_kv = S * B - n_pos;
|
||||
t = ggml_pad(ctx0, t, 0, 0, pad_kv, 0); // [D, H, S*B]
|
||||
t = ggml_roll(ctx0, t, 0, 0, P, 0); // left-pad by P
|
||||
t = ggml_cont(ctx0, t); // materialize roll (removes view offset)
|
||||
// Overlapping view: stride for B dim is C positions, not S
|
||||
// ne = [D, H, S, B], data_size = D*H*S*B*sizeof = source_nbytes (exact fit)
|
||||
// nb1=D*sizeof, nb2=D*H*sizeof, nb3=C*D*H*sizeof (overlap: C < S)
|
||||
t = ggml_view_4d(ctx0, t, d_head, n_head, S, B,
|
||||
t->nb[1], t->nb[2], C * t->nb[2], 0);
|
||||
t = ggml_cont(ctx0, t); // materialize overlapping windows
|
||||
return t;
|
||||
};
|
||||
|
||||
ggml_tensor * Kblk = extract_blocks(Kcur);
|
||||
// [D, H, S, B] -> [D, S, B, H] via permute(0,3,1,2)
|
||||
Kblk = ggml_cont(ctx0, ggml_permute(ctx0, Kblk, 0, 3, 1, 2));
|
||||
|
||||
ggml_tensor * Vblk = extract_blocks(Vcur);
|
||||
// [D, H, S, B] -> [S, D, B, H] via permute(1,3,0,2)
|
||||
Vblk = ggml_cont(ctx0, ggml_permute(ctx0, Vblk, 1, 3, 0, 2));
|
||||
|
||||
// Content attention: Q @ K^T
|
||||
// Kblk=[D,S,B,H], Qcur=[D,C,B,H] -> mul_mat contracts on D -> [S,C,B,H]
|
||||
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Kblk, Qcur);
|
||||
|
||||
// Relative position attention
|
||||
if (layer.attn_k_rel_w) {
|
||||
// RPE: [n_embd, R] -> project -> [D, H, R] -> [D, R, H]
|
||||
auto * p = ggml_mul_mat(ctx0, layer.attn_k_rel_w, pos_emb);
|
||||
p = ggml_reshape_3d(ctx0, p, d_head, n_head, R);
|
||||
p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3)); // [D, R, H]
|
||||
|
||||
// Q_flat @ RPE^T: [D, C*B, H] @ [D, R, H] -> [R, C*B, H]
|
||||
auto * Q_flat = ggml_reshape_3d(ctx0, Qcur, d_head, C * B, n_head);
|
||||
auto * matrix_bd = ggml_mul_mat(ctx0, p, Q_flat); // [R, C*B, H]
|
||||
matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, R, C, B, n_head); // [R, C, B, H]
|
||||
|
||||
// Blocked relative shift (appendix B of Transformer-XL)
|
||||
{
|
||||
// pad rows to S+1, flatten, then keep the first C*S elements so that
// re-reading them with row length S offsets each successive row by one
matrix_bd = ggml_pad(ctx0, matrix_bd, S + 1 - R, 0, 0, 0); // [S+1, C, B, H]
|
||||
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, (S + 1) * C, B, n_head);
|
||||
matrix_bd = ggml_view_3d(ctx0, matrix_bd,
|
||||
C * S, B, n_head,
|
||||
matrix_bd->nb[1], matrix_bd->nb[2], 0);
|
||||
matrix_bd = ggml_cont(ctx0, matrix_bd); // [C*S, B, H]
|
||||
matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, S, C, B, n_head); // [S, C, B, H]
|
||||
}
|
||||
|
||||
matrix_ac = ggml_add(ctx0, matrix_ac, matrix_bd);
|
||||
}
|
||||
|
||||
auto * scores = matrix_ac; // [S, C, B, H]
|
||||
|
||||
// Softcap
|
||||
// scores = softcap * tanh(scores / softcap)
scores = ggml_scale(ctx0, scores, 1.0f / softcap);
|
||||
scores = ggml_tanh(ctx0, scores);
|
||||
scores = ggml_scale(ctx0, scores, softcap);
|
||||
|
||||
// Blocked attention mask: [S, C, B] broadcasts over H
|
||||
scores = ggml_add(ctx0, scores, kq_mask);
|
||||
|
||||
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
|
||||
|
||||
// attn @ V: [S,C,B,H] @ [S,D,B,H] -> [D,C,B,H]
|
||||
ggml_tensor * x = ggml_mul_mat(ctx0, Vblk, attn);
|
||||
|
||||
// [D,C,B,H] -> [D,H,C,B] via permute(0,2,3,1) -> flatten -> trim
|
||||
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 3, 1));
|
||||
x = ggml_cont_2d(ctx0, x, d_head * n_head, C * B);
|
||||
// drop the positions that were only added to round n_pos up to a whole block
if (pad_seq > 0) {
|
||||
x = ggml_view_2d(ctx0, x, d_head * n_head, n_pos, x->nb[1], 0);
|
||||
x = ggml_cont(ctx0, x);
|
||||
}
|
||||
|
||||
x = build_mm(layer.o_w, x);
|
||||
if (layer.o_b) { x = ggml_add(ctx0, x, layer.o_b); }
|
||||
|
||||
if (layer.attn_post_norm_w) {
|
||||
x = build_norm(x, layer.attn_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
// attention output is added at full weight (no res_weight scaling)
residual = ggml_add(ctx0, residual, x);
|
||||
}
|
||||
|
||||
// Convolution Module
|
||||
if (layer.norm_conv_w && layer.conv_pw1_w && layer.conv_dw_w && layer.conv_pw2_w) {
|
||||
cur = build_norm(residual, layer.norm_conv_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
auto * x = build_mm(layer.conv_pw1_w, cur);
|
||||
|
||||
// GLU
|
||||
{
|
||||
// first half of ne[0] is the value, second half is the sigmoid gate
int64_t d = x->ne[0] / 2;
|
||||
ggml_tensor * gate = ggml_sigmoid(ctx0,
|
||||
ggml_cont(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])));
|
||||
x = ggml_mul(ctx0,
|
||||
ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
|
||||
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
|
||||
}
|
||||
|
||||
// Causal depthwise Conv1D via ggml_ssm_conv (pad+roll for left-only padding).
|
||||
// NOTE(review): the pad/roll amount 4 is hard-coded; presumably kernel width - 1
// for a width-5 depthwise kernel — confirm against layer.conv_dw_w->ne[0].
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
|
||||
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
|
||||
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
|
||||
if (layer.conv_dw_b) {
|
||||
x = ggml_add(ctx0, x, layer.conv_dw_b);
|
||||
}
|
||||
|
||||
if (layer.conv_norm_w) {
|
||||
x = ggml_rms_norm(ctx0, x, norm_eps);
|
||||
x = ggml_mul(ctx0, x, layer.conv_norm_w);
|
||||
}
|
||||
x = ggml_silu(ctx0, x);
|
||||
x = build_mm(layer.conv_pw2_w, x);
|
||||
residual = ggml_add(ctx0, residual, x);
|
||||
}
|
||||
|
||||
// FFN 2 (half-step)
|
||||
if (layer.ff_norm_1_w && layer.ff_up_1_w && layer.ff_down_1_w) {
|
||||
cur = build_norm(residual, layer.ff_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_1_w, nullptr, nullptr, nullptr,
|
||||
layer.ff_down_1_w, nullptr, FFN_SILU, il);
|
||||
if (layer.ff_post_norm_1_w) {
|
||||
cur = build_norm(cur, layer.ff_post_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight));
|
||||
}
|
||||
|
||||
// Layer output norm
|
||||
// the result becomes `cur`, which seeds `residual` for the next layer
cur = layer.ln_2_w
|
||||
? build_norm(residual, layer.ln_2_w, nullptr, NORM_TYPE_RMS, norm_eps, il)
|
||||
: residual;
|
||||
|
||||
}
|
||||
|
||||
// 4. Output Projection
|
||||
if (model.audio_out_proj_w) {
|
||||
cur = build_mm(model.audio_out_proj_w, cur);
|
||||
if (model.audio_out_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.audio_out_proj_b);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Audio Multimodal Embedder
|
||||
// the unweighted RMS norm is always applied; the learned scale and projection are optional
cur = ggml_rms_norm(ctx0, cur, norm_eps);
|
||||
if (model.mm_soft_emb_norm_w) {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
|
||||
}
|
||||
if (model.mm_input_proj_w) {
|
||||
cur = build_mm(model.mm_input_proj_w, cur);
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
||||
// Matrix multiply `w @ x` with optional clamping of the input and output.
// Clamp bounds are looked up in model.clamp_info_map by the weight tensor's name.
// When no entry exists for `w`, this is a plain ggml_mul_mat.
// Returns the (possibly clamped) product tensor.
ggml_tensor * clip_graph_gemma4a::build_mm(ggml_tensor * w, ggml_tensor * x) const {
|
||||
auto it = model.clamp_info_map.find(w->name);
|
||||
if (it == model.clamp_info_map.end()) {
|
||||
// no clamp info recorded for this weight: unclamped matmul
return ggml_mul_mat(ctx0, w, x);
|
||||
}
|
||||
const auto & ci = it->second;
|
||||
// clamp the activations to [inp_min, inp_max] before the matmul ...
ggml_tensor * clamped = ggml_clamp(ctx0, x, ci.inp_min, ci.inp_max);
|
||||
ggml_tensor * out = ggml_mul_mat(ctx0, w, clamped);
|
||||
// ... and the result to [out_min, out_max] afterwards
return ggml_clamp(ctx0, out, ci.out_min, ci.out_max);
|
||||
}
|
||||
|
|
@ -103,6 +103,12 @@ struct clip_graph_conformer : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// Graph builder for the Gemma 4 audio conformer encoder.
struct clip_graph_gemma4a : clip_graph {
|
||||
// forwards the context and input straight to the base clip_graph constructor
clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
// builds and returns the encoder compute graph
ggml_cgraph * build() override;
|
||||
// overrides the base matmul helper; takes a weight `w` and an input `x`
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
};
|
||||
|
||||
struct clip_graph_glm4v : clip_graph {
|
||||
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
// some of the code here is copied from whisper.cpp
|
||||
|
||||
|
|
@ -37,23 +38,36 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
|||
float fmin,
|
||||
float fmax,
|
||||
bool slaney_area_norm,
|
||||
float scale) {
|
||||
float scale,
|
||||
bool use_htk) {
|
||||
GGML_ASSERT(n_mel > 0 && n_fft > 1);
|
||||
if (fmax <= 0.0f) {
|
||||
fmax = 0.5f * sample_rate;
|
||||
}
|
||||
|
||||
// Slaney scale (matches librosa default)
|
||||
const double min_log_hz = 1000.0;
|
||||
const double lin_slope = 3 / 200.;
|
||||
const double min_log_mel = min_log_hz * lin_slope;
|
||||
const double log_step = log(6.4) / 27.0;
|
||||
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
|
||||
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
|
||||
};
|
||||
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
|
||||
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
|
||||
};
|
||||
std::function<double(double)> hz_to_mel;
|
||||
std::function<double(double)> mel_to_hz;
|
||||
|
||||
if (use_htk) {
|
||||
hz_to_mel = [](const double f_hz) -> double {
|
||||
return 2595.0 * log10(1.0 + f_hz / 700.0);
|
||||
};
|
||||
mel_to_hz = [](const double m) -> double {
|
||||
return 700.0 * (pow(10.0, m / 2595.0) - 1.0);
|
||||
};
|
||||
} else {
|
||||
// Slaney scale (matches librosa default)
|
||||
const double min_log_hz = 1000.0;
|
||||
const double lin_slope = 3 / 200.;
|
||||
const double min_log_mel = min_log_hz * lin_slope;
|
||||
const double log_step = log(6.4) / 27.0;
|
||||
hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
|
||||
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
|
||||
};
|
||||
mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
|
||||
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
|
||||
};
|
||||
}
|
||||
|
||||
// infer N_fft from n_fft_bins
|
||||
const double bin_hz_step = double(sample_rate) / double(n_fft);
|
||||
|
|
@ -257,10 +271,13 @@ struct filter_params {
|
|||
int32_t hann_window_size;
|
||||
int32_t hop_length;
|
||||
int32_t sample_rate;
|
||||
bool center_padding = false;
|
||||
float preemph = 0.f;
|
||||
bool no_padding = false;
|
||||
bool center_padding = false;
|
||||
float preemph = 0.f;
|
||||
bool use_natural_log = false;
|
||||
bool norm_per_feature = false;
|
||||
bool use_magnitude = false; // |X| instead of |X|^2
|
||||
float mel_floor = 5.960464477539063e-08f;
|
||||
};
|
||||
|
||||
static void log_mel_spectrogram_worker_thread(int ith,
|
||||
|
|
@ -301,10 +318,10 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
|||
// FFT
|
||||
fft(cache, fft_in.data(), frame_size, fft_out.data());
|
||||
|
||||
// Calculate modulus^2 of complex numbers
|
||||
// Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
|
||||
// Calculate modulus^2 (power) or modulus (magnitude)
|
||||
for (int j = 0; j < n_fft_bins; j++) {
|
||||
fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
|
||||
float power = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
|
||||
fft_out[j] = params.use_magnitude ? sqrtf(power) : power;
|
||||
}
|
||||
|
||||
// mel spectrogram
|
||||
|
|
@ -324,9 +341,10 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
|||
for (; k < n_fft_bins; k++) {
|
||||
sum += fft_out[k] * filters.data[j * n_fft_bins + k];
|
||||
}
|
||||
sum = std::max(sum, (double)params.mel_floor);
|
||||
sum = params.use_natural_log
|
||||
? log(sum + 5.960464477539063e-08)
|
||||
: log10(std::max(sum, 1e-10));
|
||||
? log(sum)
|
||||
: log10(sum);
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -360,7 +378,12 @@ static bool log_mel_spectrogram(
|
|||
|
||||
// Padding
|
||||
std::vector<float> samples_padded;
|
||||
if (params.center_padding) {
|
||||
if (params.no_padding) {
|
||||
// no padding, use samples as-is
|
||||
samples_padded = std::vector<float>(samples, samples + n_samples);
|
||||
samples = samples_padded.data();
|
||||
n_samples = samples_padded.size();
|
||||
} else if (params.center_padding) {
|
||||
const auto pad_amount = frame_size / 2;
|
||||
samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0);
|
||||
std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount);
|
||||
|
|
@ -464,8 +487,8 @@ static bool log_mel_spectrogram(
|
|||
out.data[i * out.n_len + j] = 0.0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// clamping and normalization
|
||||
} else if (!params.no_padding) {
|
||||
// Whisper-style clamping and normalization (NOT used by Gemma4)
|
||||
double mmax = -1e20;
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
if (out.data[i] > mmax) {
|
||||
|
|
@ -627,6 +650,87 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
|
|||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_preprocessor_gemma4a
|
||||
//
|
||||
|
||||
void mtmd_audio_preprocessor_gemma4a::initialize() {
|
||||
cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
|
||||
// Standard periodic Hann window, zero-padded to FFT size
|
||||
cache.hann_window.assign(hparams.audio_n_fft, 0.0f);
|
||||
for (uint32_t i = 0; i < (uint32_t)hparams.audio_window_len; i++) {
|
||||
cache.hann_window[i] = 0.5f - 0.5f * cosf((2.0f * (float)M_PI * i) / hparams.audio_window_len);
|
||||
}
|
||||
|
||||
// HTK mel scale, no Slaney area normalization
|
||||
cache.fill_mel_filterbank_matrix(
|
||||
hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate,
|
||||
0.0f, hparams.audio_sample_rate / 2.0f,
|
||||
/*slaney_area_norm=*/ false,
|
||||
/*scale=*/ 1.0f,
|
||||
/*use_htk=*/ true
|
||||
);
|
||||
}
|
||||
|
||||
bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
if (n_samples == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
|
||||
filter_params params;
|
||||
params.n_mel = hparams.n_mel_bins;
|
||||
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
|
||||
params.hann_window_size = hparams.audio_n_fft; // window is zero-padded to FFT size
|
||||
params.hop_length = hparams.audio_hop_len;
|
||||
params.sample_rate = hparams.audio_sample_rate;
|
||||
params.no_padding = true;
|
||||
params.center_padding = false;
|
||||
params.preemph = 0.0f;
|
||||
params.use_natural_log = true;
|
||||
params.use_magnitude = true;
|
||||
params.mel_floor = 0.001f;
|
||||
params.norm_per_feature = false;
|
||||
|
||||
// Split into 30-second chunks (model context limit, ~750 tokens each)
|
||||
const size_t chunk_samples = 30 * hparams.audio_sample_rate;
|
||||
for (size_t off = 0; off < n_samples; off += chunk_samples) {
|
||||
const float * chunk_ptr = samples + off;
|
||||
size_t chunk_len = std::min(chunk_samples, n_samples - off);
|
||||
|
||||
// Semicausal left-padding + right-padding to match PyTorch frame count
|
||||
const int pad_left = hparams.audio_window_len / 2;
|
||||
const int fft_size = hparams.audio_n_fft;
|
||||
const int hop = hparams.audio_hop_len;
|
||||
const int n_with_left = (int)chunk_len + pad_left;
|
||||
// PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform
|
||||
const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
|
||||
const int n_padded_needed = (pt_frames - 1) * hop + fft_size;
|
||||
const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left);
|
||||
std::vector<float> padded_samples(total_pad + chunk_len, 0.0f);
|
||||
std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left);
|
||||
|
||||
mtmd_audio_mel out_chunk;
|
||||
bool ok = log_mel_spectrogram(padded_samples.data(), padded_samples.size(), 4, params, cache, out_chunk);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Trim to PyTorch frame count
|
||||
out_chunk.n_len = std::min(out_chunk.n_len, pt_frames);
|
||||
|
||||
output.push_back(std::move(out_chunk));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_streaming_istft implementation
|
||||
//
|
||||
|
|
|
|||
|
|
@ -45,7 +45,8 @@ struct mtmd_audio_cache {
|
|||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
bool slaney_area_norm = true,
|
||||
float scale = 1.0f // optional extra scaling
|
||||
float scale = 1.0f,
|
||||
bool use_htk = false
|
||||
);
|
||||
};
|
||||
|
||||
|
|
@ -77,6 +78,15 @@ struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
|||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
//
|
||||
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
|
||||
//
|
||||
|
|
|
|||
|
|
@ -484,6 +484,12 @@ struct mtmd_context {
|
|||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
aud_beg = "<|audio>";
|
||||
aud_end = "<audio|>";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_gemma4a>(ctx_a);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue