mtmd : Adding support for Nvidia Music Flamingo Model (#18470)
* Initial commit, debugging q5_k_s quant
* Made hf_to_gguf extend whisper to reduce code duplication
* Addressed convert_hf_to_gguf pull request issue

---------

Co-authored-by: Henry D <henrydorsey147@gmail.com>
commit 9b8329de7a (parent 9a6369bb60)
@@ -3503,7 +3503,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()
 
 
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
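Registering "AudioFlamingo3ForConditionalGeneration" on Qwen2Model means the language-model tower of the HF checkpoint is converted by the existing Qwen2 path unchanged; only the audio encoder needs the new class added below.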
@@ -9292,6 +9292,18 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel):
         self.gguf_writer.add_audio_stack_factor(4)  # == intermediate_size // hidden_size
 
 
+@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
+class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            # Was trained in BF16, being safe, avoiding quantizing to FP16
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
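A brief note on the forced F32 above: the reason FP16 is unsafe here is a numeric-range mismatch, summarized in this sketch (background, not part of the diff):

// BF16 keeps FP32's 8-bit exponent (values up to ~3.4e38), while FP16
// has a 5-bit exponent and saturates at 65504. A conv weight trained in
// BF16 can therefore fall outside FP16's representable range and
// overflow to infinity on conversion, so the converter pins these
// tensors to F32.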
@@ -3492,6 +3492,7 @@ class VisionProjectorType:
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
     LFM2A = "lfm2a"  # audio
+    MUSIC_FLAMINGO = "musicflamingo"  # audio
     GLM4V = "glm4v"
@@ -180,6 +180,7 @@ enum projector_type {
     PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
+    PROJECTOR_TYPE_MUSIC_FLAMINGO,
     PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_KIMIVL,
     PROJECTOR_TYPE_LIGHTONOCR,
@@ -209,6 +210,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GLMA,      "glma"},
     { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
+    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
     { PROJECTOR_TYPE_LFM2,      "lfm2"},
     { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
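The enum entry and the name-map entry come as a pair because the projector type is stored as a string in the mmproj GGUF metadata and resolved back to the enum at load time. A minimal sketch of that reverse lookup (illustrative; the actual helper in clip.cpp may differ in name, and PROJECTOR_TYPE_UNKNOWN is assumed to be the enum's sentinel):

static projector_type projector_type_from_name(const std::string & name) {
    // scan PROJECTOR_TYPE_NAMES for the string read from GGUF,
    // e.g. "musicflamingo" -> PROJECTOR_TYPE_MUSIC_FLAMINGO
    for (const auto & kv : PROJECTOR_TYPE_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN; // assumed sentinel for unrecognized names
}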
@@ -319,7 +319,8 @@ struct clip_model {
 
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL;
+            || proj_type == PROJECTOR_TYPE_VOXTRAL
+            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
     }
 
     bool audio_has_stack_frames() const {
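Opting MUSIC_FLAMINGO into audio_has_avgpool() makes the whisper-style encoder graph end with an average pool over the time axis, the same downsampling QWEN2A and VOXTRAL use. Its effect on the token budget, as a sketch (assuming the stride-2 pool used by those projectors):

// With a stride-2 average pool at the end of the encoder, the number of
// embedding tokens handed to the projector is roughly half the
// encoder's output frame count:
static int n_tokens_after_avgpool(int n_enc_frames) {
    return n_enc_frames / 2;
}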
@@ -818,6 +818,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
@@ -1176,6 +1177,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_QWEN2A:
             case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_VOXTRAL:
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                 {
                     bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                          model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1576,6 +1578,17 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                     model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                 } break;
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+                {
+                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
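Note that, unlike the case above which loads only the MLP weights, the MUSIC_FLAMINGO branch also loads biases for both the conv stem and the projector MLP. For reference, the names these get_tensor calls request, assuming TN_CONV1D and TN_MM_AUDIO_MLP expand the same way as for the other audio projectors ("a.conv1d.%d.%s" and "mm.a.mlp.%d.%s"):

// Assumed expansion of the string_format calls above:
//   a.conv1d.1.weight, a.conv1d.1.bias   // whisper-style conv stem
//   a.conv1d.2.weight, a.conv1d.2.bias
//   mm.a.mlp.1.weight, mm.a.mlp.1.bias   // two-layer projector MLP
//   mm.a.mlp.2.weight, mm.a.mlp.2.bias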
@@ -3031,6 +3044,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 n_patches = img->nx;
@@ -3403,6 +3417,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
             {
@@ -3526,6 +3541,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.projection->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
             return ctx->model.mm_3_w->ne[1];
@@ -3587,7 +3603,8 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
         || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
+        || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
 }
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
@@ -86,6 +86,15 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
             FFN_GELU_ERF,
             -1);
 
+    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+        // projector
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);
+
     } else if (proj_type == PROJECTOR_TYPE_GLMA) {
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
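Spelled out, this build_ffn call is a plain two-layer MLP with biases and an erf-based GELU between the layers; the (nullptr, nullptr) pair is the unused gate branch, assuming build_ffn's usual (up, up_b, gate, gate_b, down, down_b, op, layer) argument order. An op-by-op sketch of the equivalent computation (illustrative, not the actual lowering):

static ggml_tensor * music_flamingo_projector_sketch(ggml_context * ctx0,
                                                     ggml_tensor * cur,
                                                     const clip_model & model) {
    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); // up-projection
    cur = ggml_add(ctx0, cur, model.mm_1_b);
    cur = ggml_gelu_erf(ctx0, cur);              // FFN_GELU_ERF: exact (erf) GELU
    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); // down-projection
    cur = ggml_add(ctx0, cur, model.mm_2_b);
    return cur;
}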
@@ -330,6 +330,7 @@ struct mtmd_context {
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_VOXTRAL:
             case PROJECTOR_TYPE_GLMA:
+            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                 audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                 break;
             case PROJECTOR_TYPE_LFM2A:
@@ -352,6 +353,9 @@ struct mtmd_context {
             // [BEGIN_AUDIO] ... (embeddings) ...
             aud_beg = "[BEGIN_AUDIO]";
 
+        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+            // <sound> ... (embeddings) ...
+            aud_beg = "<sound>";
         }
     }
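With this marker registered, mtmd prepends <sound> before each run of projected audio embeddings, so the language model sees the framing it was trained with, e.g.:

    ... prompt text ... <sound> [audio embeddings] ... prompt text ...

mirroring how Voxtral uses [BEGIN_AUDIO] in the branch above.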