From 2e17f6a93144fa412fa40d0b92e3d022c887bb1a Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 13 Feb 2026 16:10:48 +0100 Subject: [PATCH] mtmd : chat : Fix extra \n between text and media marker Thanks to @tugot17 for detecting and reporting the issue. For vision models (e.g. LFM2.5-VL-1.6B and Qwen/Qwen3-VL-4B-Instruct), `llama-mtmd-cli` produces output identical to the HF implementation. However, `llama-server` doesn't. I traced it down to an extra newline inserted after `<__media__>`. This happens in `to_json_oaicompat`, which treats media markers as text and joins all parts with a `\n` separator. This PR introduces a new part type, `media_marker`, and uses it for media markers. Extra logic is added to prevent the insertion of newlines before and after media markers. With this change, the number of input tokens is identical to the HF implementation and, as a result, the output is also identical. I explored other ways to address the issue: * completely remove the `\n` between text parts in `to_json_oaicompat` * merge text messages in server-common.cpp before sending them to `to_json_oaicompat` Please propose alternative ways of fixing this issue. 
--- common/chat.cpp | 11 +++++++---- tools/server/server-common.cpp | 6 ++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 47a34d5822..f8f3a0a797 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -65,15 +65,18 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const { } else if (!content_parts.empty()) { if (concat_typed_text) { std::string text; + bool last_was_media_marker = false; + // join parts with newline, do not add newline before or after media markers for (const auto & part : content_parts) { - if (part.type != "text") { + if (part.type != "text" && part.type != "media_marker") { LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); continue; } - if (!text.empty()) { + if (part.type != "media_marker" && !last_was_media_marker && !text.empty()) { text += '\n'; } - text += part.text; + last_was_media_marker = (part.type == "media_marker"); + text += part.text; } jmsg["content"] = text; } else { @@ -319,7 +322,7 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa throw std::invalid_argument("Missing content part type: " + part.dump()); } const auto & type = part.at("type"); - if (type != "text") { + if (type != "text" && type != "media_marker") { throw std::invalid_argument("Unsupported content part type: " + type.dump()); } common_chat_msg_content_part msg_part; diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index a853f65c8d..d717fb6698 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -916,8 +916,7 @@ json oaicompat_chat_params_parse( json image_url = json_value(p, "image_url", json::object()); handle_media(out_files, image_url, opt.media_path); - // replace this chunk with a marker - p["type"] = "text"; + p["type"] = "media_marker"; p["text"] = mtmd_default_marker(); p.erase("image_url"); @@ -938,8 +937,7 @@ json oaicompat_chat_params_parse( // TODO: add audio_url support by reusing 
handle_media() - // replace this chunk with a marker - p["type"] = "text"; + p["type"] = "media_marker"; p["text"] = mtmd_default_marker(); p.erase("input_audio");