diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index fc6a6223cf..aa7e8f967d 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -49,6 +49,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek",     LLM_CHAT_TEMPLATE_DEEPSEEK },
     { "deepseek2",    LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
     { "deepseek3",    LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
+    { "deepseek-ocr", LLM_CHAT_TEMPLATE_DEEPSEEK_OCR },
     { "command-r",    LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3",       LLM_CHAT_TEMPLATE_LLAMA_3 },
     { "chatglm3",     LLM_CHAT_TEMPLATE_CHATGLM_3 },
@@ -541,6 +542,11 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << LU8("<｜Assistant｜>");
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) {
+        for (auto message : chat) {
+            // no template
+            ss << message->content;
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 684efb4d67..326db1896c 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -28,6 +28,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK,
     LLM_CHAT_TEMPLATE_DEEPSEEK_2,
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_OCR,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
     LLM_CHAT_TEMPLATE_CHATGLM_3,
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index ab0dc72628..3c41001100 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -222,20 +222,14 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text          = msg.content.c_str();
+    text.text          = formatted_chat.c_str();
     text.add_special   = add_bos;
     text.parse_special = true;
 
-    std::string formatted_chat;
-
-    if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
-        formatted_chat = chat_add_and_format(ctx, msg);
-        LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
-        text.text = formatted_chat.c_str();
-    }
-
     if (g_is_interrupted) return 0;
 
     mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -319,18 +313,8 @@
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
-            if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
-                std::string image_tokens = "";
-                for (size_t i = 0; i < params.image.size(); i++) {
-                    image_tokens += mtmd_default_marker();
-                    image_tokens += '\n';
-                }
-                params.prompt = image_tokens + params.prompt;
-            }
-            else {
             for (size_t i = 0; i < params.image.size(); i++) {
                 params.prompt += mtmd_default_marker();
-            }
             }
         }
         common_chat_msg msg;
@@ -349,11 +333,6 @@
         }
 
     } else {
-        if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
-            LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
-            return 1;
-        }
-
         LOG("\n Running in chat mode, available commands:");
         if (mtmd_support_vision(ctx.ctx_vision.get())) {
             LOG("\n    /image <path>    load an image");
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 791ac77166..33042722eb 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -868,10 +868,6 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
 
-bool mtmd_is_deepseekocr(mtmd_context * ctx) {
-    return ctx->ctx_v && clip_is_deepseekocr(ctx->ctx_v);
-}
-
 //
 // public API functions
 //
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 0c2d001db6..b3df24c299 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -117,9 +117,6 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
 
-// whether the current model is DeepSeek-OCR
-MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
-
 // mtmd_bitmap
 //
 // if bitmap is image:
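
Illustrative note (not part of the patch): with the "deepseek-ocr" entry registered in LLM_CHAT_TEMPLATES, the template can be resolved through the public llama_chat_apply_template() API, and because the LLM_CHAT_TEMPLATE_DEEPSEEK_OCR branch adds no role markup, the rendered prompt is expected to be just the concatenated message contents. The sketch below is a minimal hypothetical usage example; the file name and message text are made up.

// usage_sketch.cpp -- hypothetical example, not added by this patch
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    // a single user message; the content is arbitrary example text
    llama_chat_message msgs[] = {
        { "user", "Free OCR." },
    };
    std::vector<char> buf(512);
    // "deepseek-ocr" now maps to LLM_CHAT_TEMPLATE_DEEPSEEK_OCR via LLM_CHAT_TEMPLATES
    int32_t n = llama_chat_apply_template("deepseek-ocr", msgs, 1,
                                          /*add_ass=*/true, buf.data(), buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        // expected output: "Free OCR." -- no role tags are inserted,
        // unlike the other DeepSeek templates
        printf("%.*s\n", n, buf.data());
    }
    return 0;
}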