Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr
This commit is contained in:
commit
13dc6fb305
|
|
@ -5790,16 +5790,16 @@ class Gemma3VisionModel(MmprojModel):
|
||||||
|
|
||||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||||
class DeepseekOCRVisionModel(MmprojModel):
|
class DeepseekOCRVisionModel(MmprojModel):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
proc_fname = self.dir_model / "processor_config.json"
|
proc_fname = self.dir_model / "processor_config.json"
|
||||||
|
|
||||||
if proc_fname.is_file():
|
if proc_fname.is_file():
|
||||||
with open(proc_fname, "r") as f:
|
with open(proc_fname, "r") as f:
|
||||||
self.preprocessor_config = json.load(f)
|
self.preprocessor_config = json.load(f)
|
||||||
|
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
|
@ -5857,7 +5857,7 @@ class DeepseekOCRVisionModel(MmprojModel):
|
||||||
return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
|
return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
|
||||||
|
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("Gemma3nForConditionalGeneration")
|
@ModelBase.register("Gemma3nForConditionalGeneration")
|
||||||
class Gemma3NModel(Gemma3Model):
|
class Gemma3NModel(Gemma3Model):
|
||||||
|
|
|
||||||
|
|
@ -130,18 +130,18 @@
|
||||||
#define TN_TOK_EOI "v.eoi"
|
#define TN_TOK_EOI "v.eoi"
|
||||||
|
|
||||||
// deepseek-ocr
|
// deepseek-ocr
|
||||||
#define TN_SAM_POS_EMBD "sam.pos_embd"
|
#define TN_SAM_POS_EMBD "v.sam.pos_embd"
|
||||||
#define TN_SAM_PATCH_EMBD "sam.patch_embd.%s"
|
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
|
||||||
#define TN_SAM_PRE_NORM "sam.blk.%d.pre_ln.%s"
|
#define TN_SAM_PRE_NORM "v.sam.blk.%d.pre_ln.%s"
|
||||||
#define TN_SAM_POST_NORM "sam.blk.%d.post_ln"
|
#define TN_SAM_POST_NORM "v.sam.blk.%d.post_ln"
|
||||||
#define TN_SAM_ATTN_POS_H "sam.blk.%d.attn.pos_h"
|
#define TN_SAM_ATTN_POS_H "v.sam.blk.%d.attn.pos_h"
|
||||||
#define TN_SAM_ATTN_POS_W "sam.blk.%d.attn.pos_w"
|
#define TN_SAM_ATTN_POS_W "v.sam.blk.%d.attn.pos_w"
|
||||||
#define TN_SAM_ATTN_QKV "sam.blk.%d.attn.qkv.%s"
|
#define TN_SAM_ATTN_QKV "v.sam.blk.%d.attn.qkv.%s"
|
||||||
#define TN_SAM_ATTN_OUT "sam.blk.%d.attn.out.%s"
|
#define TN_SAM_ATTN_OUT "v.sam.blk.%d.attn.out.%s"
|
||||||
#define TN_SAM_FFN_UP "sam.blk.%d.mlp.lin1.%s"
|
#define TN_SAM_FFN_UP "v.sam.blk.%d.mlp.lin1.%s"
|
||||||
#define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
|
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
|
||||||
#define TN_SAM_NECK "sam.neck.%d.%s"
|
#define TN_SAM_NECK "v.sam.neck.%d.%s"
|
||||||
#define TN_SAM_NET "sam.net_%d.%s"
|
#define TN_SAM_NET "v.sam.net_%d.%s"
|
||||||
|
|
||||||
// align x to upper multiple of n
|
// align x to upper multiple of n
|
||||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||||
|
|
@ -170,7 +170,7 @@ enum projector_type {
|
||||||
PROJECTOR_TYPE_LIGHTONOCR,
|
PROJECTOR_TYPE_LIGHTONOCR,
|
||||||
PROJECTOR_TYPE_COGVLM,
|
PROJECTOR_TYPE_COGVLM,
|
||||||
PROJECTOR_TYPE_JANUS_PRO,
|
PROJECTOR_TYPE_JANUS_PRO,
|
||||||
PROJECTOR_TYPE_DEEPSEEK_OCR,
|
PROJECTOR_TYPE_DEEPSEEKOCR,
|
||||||
PROJECTOR_TYPE_UNKNOWN,
|
PROJECTOR_TYPE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -197,7 +197,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||||
{ PROJECTOR_TYPE_DEEPSEEK_OCR,"deepseek_orc"},
|
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
|
||||||
};
|
};
|
||||||
|
|
||||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||||
|
|
|
||||||
|
|
@ -682,8 +682,8 @@ struct clip_graph {
|
||||||
|
|
||||||
const int enc_n_patches = enc_image_size / enc_patch_size; // 64
|
const int enc_n_patches = enc_image_size / enc_patch_size; // 64
|
||||||
|
|
||||||
ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_image_size, enc_n_embd);
|
ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
|
||||||
ggml_tensor * cur = ggml_add(ctx0, inpL, model.position_embeddings);
|
ggml_tensor * cur = ggml_add(ctx0, inpL, model.pos_embed);
|
||||||
|
|
||||||
// loop over layers
|
// loop over layers
|
||||||
for (int il = 0; il < _depth; il++) {
|
for (int il = 0; il < _depth; il++) {
|
||||||
|
|
@ -842,7 +842,7 @@ struct clip_graph {
|
||||||
ggml_tensor * inp_raw = build_inp_raw();
|
ggml_tensor * inp_raw = build_inp_raw();
|
||||||
|
|
||||||
|
|
||||||
ggml_tensor * global_features_1 = build_sam_enc(inp_raw);
|
ggml_tensor * global_features_1 = build_sam_enc(inp_raw, std::max(img.nx, img.ny));
|
||||||
|
|
||||||
ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);
|
ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);
|
||||||
|
|
||||||
|
|
@ -2862,6 +2862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
{
|
{
|
||||||
res = graph.build_cogvlm();
|
res = graph.build_cogvlm();
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
|
{
|
||||||
|
res = graph.build_deepseek_ocr();
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
res = graph.build_llava();
|
res = graph.build_llava();
|
||||||
|
|
@ -3187,6 +3191,11 @@ struct clip_model_loader {
|
||||||
hparams.ffn_op = FFN_GELU_ERF;
|
hparams.ffn_op = FFN_GELU_ERF;
|
||||||
log_ffn_op = "gelu_erf"; // temporary solution for logging
|
log_ffn_op = "gelu_erf"; // temporary solution for logging
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
|
{
|
||||||
|
hparams.set_limit_image_tokens(8, 1024);
|
||||||
|
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
@ -3574,7 +3583,7 @@ struct clip_model_loader {
|
||||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEK_OCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
model.pos_embed = get_tensor(TN_SAM_POS_EMBD);
|
model.pos_embed = get_tensor(TN_SAM_POS_EMBD);
|
||||||
model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
|
model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
|
||||||
|
|
@ -4830,7 +4839,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEK_OCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
// configurable, or read from params
|
// configurable, or read from params
|
||||||
const int min_num = 2;
|
const int min_num = 2;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue