Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr

commit 6dfda99c69
@@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "_<EOT>"
                 || t.first == "<|end_of_text|>"
                 || t.first == "<end_of_utterance>" // smoldocling
+                || t.first == "<|end▁of▁sentence|>" // deepseek-ocr
                ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -803,9 +803,8 @@ struct clip_graph {

         cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_3_w, model.neck_3_b, hparams.eps);

-        //TODO : check conv padding
-        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2,2,1,1, 1,1);
-        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2,2,1,1, 1,1);
+        cur = ggml_conv_2d_s1_ph(ctx0, model.net_2, cur);
+        cur = ggml_conv_2d_s1_ph(ctx0, model.net_3, cur);

         ggml_build_forward_expand(gf, cur);
         return cur;
@@ -840,22 +839,27 @@ struct clip_graph {

         ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1);

+        // FIXME remove n_patches is hardcoded
+        int clip_n_patches = 256; // FIXME hardcoded for sam 1024x1024 with 16x16 patches
+
         // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
         global_features_1 = ggml_cont(ctx0,ggml_permute(ctx0, global_features_1,2,1,0,3));
-        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, n_embd, n_patches);
+        // flatten 2nd and 3rd dims
+        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches);

         // remove CLS token
         global_features_2 = ggml_view_2d(ctx0, global_features_2,
-            n_embd, n_patches,
+            n_embd, clip_n_patches,
             ggml_row_size(global_features_2->type, n_embd), 0);

         ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);
-        global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd, n_patches);
+        global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd,clip_n_patches);
         global_features = ggml_cont(ctx0, global_features);
         global_features = ggml_mul_mat(ctx0, model.fc_w, global_features);
         global_features = ggml_add(ctx0, global_features, model.fc_b);

         global_features = build_global_local_features(ctx0,global_features);
+        global_features = ggml_cont(ctx0, ggml_permute(ctx0, global_features, 1, 0, 2, 3));
         ggml_build_forward_expand(gf, global_features);
         return gf;
     }
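Note on the hunk above: it strips the leading token from the ViT path and fuses it with the flattened SAM path before the fc projection. Below is a minimal standalone sketch of a drop-leading-row-then-stack-channel-wise pattern using only public ggml calls; the helper name, shapes, and the nonzero row offset are illustrative assumptions, not this commit's code.

    #include "ggml.h"

    // Sketch only: fuse two feature maps, dropping a leading CLS row from
    // the first. Assumed names and shapes, not from this commit.
    static ggml_tensor * fuse_features(ggml_context * ctx,
                                       ggml_tensor * clip_out, // [n_embd, 1 + n_patches], CLS row first
                                       ggml_tensor * sam_out,  // [n_embd, n_patches]
                                       int64_t n_embd, int64_t n_patches) {
        // view rows 1..n_patches: a byte offset of one row skips the CLS token
        ggml_tensor * patches = ggml_view_2d(ctx, clip_out, n_embd, n_patches,
                                             ggml_row_size(clip_out->type, n_embd),
                                             ggml_row_size(clip_out->type, n_embd));
        // stack channel-wise along dim 0 -> [2*n_embd, n_patches]
        return ggml_concat(ctx, ggml_cont(ctx, patches), sam_out, 0);
    }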
@@ -870,16 +874,16 @@ struct clip_graph {
         GGML_ASSERT(model.view_seperator != nullptr);

         // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
-        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 64, 64, 1); // (n_dim, w, h)
+        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 16, 16, 1); // (n_dim, w, h)
         t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim)
         ggml_tensor * nl = ggml_cont(ctx0,ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3));
-        nl = ggml_repeat_4d(ctx0, nl, 64, 1, 1280, 1); // n_pos rows
+        nl = ggml_repeat_4d(ctx0, nl, 16, 1, 1280, 1); // n_pos rows


         // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
         t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)

-        t = ggml_reshape_2d(ctx0, t, 1280, 64 * (64 + 1)); // (n_dim, h*(w+1))
+        t = ggml_reshape_2d(ctx0, t, 1280, 16 * (16 + 1)); // (n_dim, h*(w+1))


         // 5) append view_separator as an extra "token":
@@ -1540,9 +1544,12 @@ struct clip_graph {
         GGML_ASSERT(model.class_embedding != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);

-        const int n_pos = n_patches + 1;
-        ggml_tensor * inp = ggml_cont(ctx0,ggml_permute(ctx0, patch_embeds,2,1,0,3));
-        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+        ggml_tensor * inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds));
+
+        const int n_pos = 257; // +1 for [CLS]
+        inp = ggml_cont(ctx0,ggml_permute(ctx0, inp,2,1,0,3));
+        inp = ggml_reshape_2d(ctx0, inp, n_embd, inp->ne[1]*inp->ne[2]*inp->ne[3]);
+


@@ -1554,7 +1561,9 @@ struct clip_graph {

         // for selecting learned pos embd, used by ViT
         ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        cb(positions, "positions", -1);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+

         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
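Replacing cb(...) with explicit calls makes "positions" a named, externally-filled graph input; the PROJECTOR_TYPE_DEEPSEEKOCR case in the clip_image_batch_encode hunk below writes 0..n_pos-1 into it at encode time. A minimal sketch of the pattern, assuming only standard ggml calls (the helper name is hypothetical):

    #include "ggml.h"

    // Sketch: build an I32 "positions" tensor and flag it as a graph input
    // so the host can fill it before evaluation (hypothetical helper).
    static ggml_tensor * make_positions(ggml_context * ctx, int64_t n_pos) {
        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_pos);
        ggml_set_name(pos, "positions");
        ggml_set_input(pos);
        return pos;
    }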
@@ -5676,10 +5685,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
-        case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+            {
+                //FIXME we need correct this when all model configs are set correctly
+                //n_patch is not correct right now
+                int32_t n_pos = 16 * 16 + 1; //hardcode for now
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // set the 2D positions
@@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &

 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
-    auto formatted_chat = chat_add_and_format(ctx, msg);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());

     mtmd_input_text text;
-    text.text = formatted_chat.c_str();
+    text.text = msg.content.c_str();
     text.add_special = add_bos;
     text.parse_special = true;

+    if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+        auto formatted_chat = chat_add_and_format(ctx, msg);
+        LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
+        text.text = formatted_chat.c_str();
+    }
+
     if (g_is_interrupted) return 0;

     mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -332,6 +336,11 @@ int main(int argc, char ** argv) {
         }

     } else {
+        if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+            LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
+            return 1;
+        }
+
         LOG("\n Running in chat mode, available commands:");
         if (mtmd_support_vision(ctx.ctx_vision.get())) {
             LOG("\n /image <path> load an image");
@@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }

+bool mtmd_is_deepseekocr(mtmd_context * ctx) {
+    return ctx->ctx_v && clip_is_deepseekocr(ctx->ctx_v);
+}
+
 //
 // public API functions
 //
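For callers of the new predicate, a minimal usage sketch mirroring the mtmd-cli checks above (the helper name and error path are hypothetical, not part of this commit):

    #include "mtmd.h"
    #include <cstdio>

    // Sketch: refuse interactive chat for DeepSeek-OCR, as main() does above.
    static int check_chat_support(mtmd_context * ctx_vision) {
        if (mtmd_is_deepseekocr(ctx_vision)) {
            fprintf(stderr, "DeepSeek-OCR doesn't support chat mode\n");
            return 1;
        }
        return 0;
    }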
@@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);

+// whether the current model is DeepSeek-OCR
+MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image: