mtmd: make sam hparams configurable
This commit is contained in:
parent
15f2ada0ed
commit
2d918b3e21
|
|
@ -6032,6 +6032,7 @@ class DeepseekOCRVisionModel(MmprojModel):
|
|||
sam_hparams = hparams['sam']
|
||||
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
|
||||
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
|
||||
self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any]:
|
||||
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
|
||||
|
|
|
|||
|
|
@ -306,6 +306,7 @@ class Keys:
|
|||
class SAM:
    """GGUF metadata keys describing the SAM (Segment Anything) vision
    encoder used by deepseek-ocr style models.

    Values are the literal key strings written into the GGUF header.
    """

    BLOCK_COUNT      = "clip.vision.sam.block_count"
    EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"
    HEAD_COUNT       = "clip.vision.sam.head_count"
|
||||
|
||||
class ClipAudio:
|
||||
NUM_MEL_BINS = "clip.audio.num_mel_bins"
|
||||
|
|
|
|||
|
|
@ -1135,6 +1135,9 @@ class GGUFWriter:
|
|||
|
||||
def add_vision_sam_embedding_length(self, value: int) -> None:
    """Record the SAM vision-encoder embedding width as a uint32 metadata key."""
    key = Keys.ClipVision.SAM.EMBEDDING_LENGTH
    self.add_uint32(key, value)
|
||||
|
||||
def add_vision_sam_head_count(self, value: int) -> None:
    """Record the SAM vision-encoder attention head count as a uint32 metadata key."""
    key = Keys.ClipVision.SAM.HEAD_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
# audio models
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,9 @@
|
|||
#define KEY_ATTN_WINDOW_SIZE   "clip.vision.window_size"
#define KEY_MINICPMV_VERSION   "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"

// deepseek-ocr SAM encoder hyperparameters; written by the conversion
// script and read back via get_u32() in the model loader
#define KEY_SAM_N_HEAD  "clip.vision.sam.head_count"
#define KEY_SAM_N_BLOCK "clip.vision.sam.block_count"
#define KEY_SAM_N_EMBD  "clip.vision.sam.embedding_length"

// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
#define KEY_A_NUM_MEL_BINS  "clip.audio.num_mel_bins"
|
||||
|
|
|
|||
|
|
@ -193,6 +193,11 @@ struct clip_hparams {
|
|||
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;

// deepseek-ocr (sam)
// SAM encoder shape, loaded from GGUF metadata (KEY_SAM_N_BLOCK /
// KEY_SAM_N_HEAD / KEY_SAM_N_EMBD) instead of being hard-coded
int32_t sam_n_layer = 0;
int32_t sam_n_head = 0;
int32_t sam_n_embd = 0;

// audio
int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox
|
||||
|
|
@ -2676,9 +2681,9 @@ private:
|
|||
}
|
||||
|
||||
ggml_tensor * build_sam(ggml_tensor * inp_raw) {
|
||||
const int n_embd = 768;
|
||||
const int _depth = 12;
|
||||
const int n_heads = 12;
|
||||
const int n_embd = hparams.sam_n_embd;
|
||||
const int n_layer = hparams.sam_n_layer;
|
||||
const int n_heads = hparams.sam_n_head;
|
||||
const int d_heads = n_embd / n_heads;
|
||||
const int window = hparams.attn_window_size;
|
||||
|
||||
|
|
@ -2721,7 +2726,7 @@ private:
|
|||
}
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < _depth; il++) {
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
auto & layer = model.sam_layers[il];
|
||||
ggml_tensor * shortcut = cur;
|
||||
|
||||
|
|
@ -3286,6 +3291,10 @@ struct clip_model_loader {
|
|||
hparams.patch_size = 16;
|
||||
hparams.image_size = 1024;
|
||||
hparams.warmup_image_size = 1024;
|
||||
|
||||
get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
|
||||
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
|
||||
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
} break;
|
||||
default:
|
||||
|
|
|
|||
Loading…
Reference in New Issue