mtmd: correct token order

This commit is contained in:
bluebread 2025-11-23 09:22:00 +00:00
parent 4cfa15fcd7
commit 3f71188303
4 changed files with 20 additions and 3 deletions

View File

@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "_<EOT>"
|| t.first == "<|end_of_text|>"
|| t.first == "<end_of_utterance>" // smoldocling
|| t.first == "<end▁of▁sentence>" // deepseek-ocr
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

View File

@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
bool add_bos = ctx.chat_history.empty();
auto formatted_chat = chat_add_and_format(ctx, msg);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
mtmd_input_text text;
text.text = formatted_chat.c_str();
text.text = msg.content.c_str();
text.add_special = add_bos;
text.parse_special = true;
if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
auto formatted_chat = chat_add_and_format(ctx, msg);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
text.text = formatted_chat.c_str();
}
if (g_is_interrupted) return 0;
mtmd::input_chunks chunks(mtmd_input_chunks_init());
@ -332,6 +336,11 @@ int main(int argc, char ** argv) {
}
} else {
if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
return 1;
}
LOG("\n Running in chat mode, available commands:");
if (mtmd_support_vision(ctx.ctx_vision.get())) {
LOG("\n /image <path> load an image");

View File

@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
return 16000; // 16kHz
}
// Reports whether the clip context attached to `ctx` is a DeepSeek-OCR model.
// `ctx` is dereferenced unconditionally, so it must not be null.
bool mtmd_is_deepseekocr(mtmd_context * ctx) {
    // no clip context attached: nothing to query, so this cannot be DeepSeek-OCR
    if (!ctx->ctx_v) {
        return false;
    }
    return clip_is_deepseekocr(ctx->ctx_v);
}
//
// public API functions
//

View File

@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
// whether the current model is DeepSeek-OCR
// returns true only if a clip context is attached to ctx and it is identified
// as DeepSeek-OCR; returns false otherwise
// ctx must not be null
MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
// mtmd_bitmap
//
// if bitmap is image: