# Patch summary (free-text preamble placed ahead of the first "diff --git" header).
#
# Purpose: drop the deprecated MTMD_DEFAULT_IMAGE_MARKER ("<__image__>") and make the
# server use a per-process random media marker.
#
# tools/mtmd/mtmd.h
#   - Removes the MTMD_DEFAULT_IMAGE_MARKER macro and its "deprecated" comment.
#
# tools/mtmd/mtmd.cpp
#   - mtmd_context_params_default(): image_marker now defaults to nullptr instead of
#     the removed macro.
#   - struct mtmd_context: the guard now throws std::runtime_error whenever
#     ctx_params.image_marker is non-null (it used to compare the string against the
#     default macro value).
#     NOTE(review): a caller that still passes the literal "<__image__>" will now hit
#     this exception - confirm that is intended.
#   - struct mtmd_tokenizer: the compatibility string_replace_all() that rewrote the
#     old image marker into ctx->media_marker inside input_text is removed, so
#     "<__image__>" in a prompt is no longer treated specially by this code.
#
# tools/server/server-common.{h,cpp}
#   - Adds get_media_marker(): on first call it fills a file-static std::string with
#     "<__media_" + random_string() + "__>" and afterwards returns its c_str().
#     NOTE(review): no synchronization around the lazy initialization is visible in
#     this patch; presumably the first call happens once during model load (see the
#     server-context.cpp hunk) before request threads run - TODO confirm.
#   - oaicompat_chat_params_parse() (the image_url and input_audio branches) and
#     convert_transcriptions_to_chatcmpl() now insert get_media_marker() where they
#     previously used mtmd_default_marker().
#
# tools/server/server-context.cpp
#   - Sets mparams.media_marker = get_media_marker() before mtmd_init_from_file(), so
#     the mtmd context is configured with the same marker the server writes into prompts.
#
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index a56d3b35b4..d0a0a4865e 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -109,7 +109,7 @@ mtmd_context_params mtmd_context_params_default() { /* use_gpu */ true, /* print_timings */ true, /* n_threads */ 4, - /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER, + /* image_marker */ nullptr, /* media_marker */ mtmd_default_marker(), /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, /* warmup */ true, @@ -169,7 +169,7 @@ struct mtmd_context { media_marker (ctx_params.media_marker), n_embd_text (llama_model_n_embd_inp(text_model)) { - if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) { + if (ctx_params.image_marker != nullptr) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); } @@ -584,9 +584,6 @@ struct mtmd_tokenizer { parse_special = text->parse_special; input_text = text->text; vocab = llama_model_get_vocab(ctx->text_model); - - // for compatibility, we convert image marker to media marker - string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker); } int32_t tokenize(mtmd_input_chunks * output) { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index c91bc08105..a6fd8efa5d 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -46,9 +46,6 @@ # define MTMD_API #endif -// deprecated marker, use mtmd_default_marker() instead -#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>" - #ifdef __cplusplus extern "C" { #endif diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index e3f2439023..fd417393f8 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -84,6 +84,14 @@ std::string gen_tool_call_id() { return random_string(); } +static std::string media_marker = ""; +const char * get_media_marker() { + if (media_marker.empty()) { + media_marker = "<__media_" + random_string() + "__>"; + } + return media_marker.c_str(); +} + // // lora utils // @@ -975,7 +983,7 
@@ json oaicompat_chat_params_parse( handle_media(out_files, image_url, opt.media_path); p["type"] = "media_marker"; - p["text"] = mtmd_default_marker(); + p["text"] = get_media_marker(); p.erase("image_url"); } else if (type == "input_audio") { @@ -996,7 +1004,7 @@ json oaicompat_chat_params_parse( // TODO: add audio_url support by reusing handle_media() p["type"] = "media_marker"; - p["text"] = mtmd_default_marker(); + p["text"] = get_media_marker(); p.erase("input_audio"); } else if (type != "text") { @@ -1460,7 +1468,7 @@ json convert_transcriptions_to_chatcmpl( if (!language.empty()) { prompt += string_format(" (language: %s)", language.c_str()); } - prompt += mtmd_default_marker(); + prompt += get_media_marker(); json chatcmpl_body = inp_body; // copy all fields chatcmpl_body["messages"] = json::array({ diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 440ebc597a..57545aa53e 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -92,6 +92,9 @@ std::string random_string(); std::string gen_chatcmplid(); std::string gen_tool_call_id(); +// get a random marker; note: each time the server restarts, the marker will be different +const char * get_media_marker(); + // // lora utils // diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index e134b3cfb2..41bdad6f87 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -708,6 +708,7 @@ private: mparams.warmup = params_base.warmup; mparams.image_min_tokens = params_base.image_min_tokens; mparams.image_max_tokens = params_base.image_max_tokens; + mparams.media_marker = get_media_marker(); mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); if (mctx == nullptr) {