server: use random media marker (#21962)
* server: use random media marker * nits * remove legacy <__image__> token * revert special char in random
This commit is contained in:
parent
b3d758750a
commit
408225bb1a
|
|
@ -109,7 +109,7 @@ mtmd_context_params mtmd_context_params_default() {
|
|||
/* use_gpu */ true,
|
||||
/* print_timings */ true,
|
||||
/* n_threads */ 4,
|
||||
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
||||
/* image_marker */ nullptr,
|
||||
/* media_marker */ mtmd_default_marker(),
|
||||
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
||||
/* warmup */ true,
|
||||
|
|
@ -169,7 +169,7 @@ struct mtmd_context {
|
|||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (llama_model_n_embd_inp(text_model))
|
||||
{
|
||||
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
||||
if (ctx_params.image_marker != nullptr) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
}
|
||||
|
||||
|
|
@ -584,9 +584,6 @@ struct mtmd_tokenizer {
|
|||
parse_special = text->parse_special;
|
||||
input_text = text->text;
|
||||
vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
}
|
||||
|
||||
int32_t tokenize(mtmd_input_chunks * output) {
|
||||
|
|
|
|||
|
|
@ -46,9 +46,6 @@
|
|||
# define MTMD_API
|
||||
#endif
|
||||
|
||||
// deprecated marker, use mtmd_default_marker() instead
|
||||
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -84,6 +84,14 @@ std::string gen_tool_call_id() {
|
|||
return random_string();
|
||||
}
|
||||
|
||||
static std::string media_marker = "";
|
||||
const char * get_media_marker() {
|
||||
if (media_marker.empty()) {
|
||||
media_marker = "<__media_" + random_string() + "__>";
|
||||
}
|
||||
return media_marker.c_str();
|
||||
}
|
||||
|
||||
//
|
||||
// lora utils
|
||||
//
|
||||
|
|
@ -975,7 +983,7 @@ json oaicompat_chat_params_parse(
|
|||
handle_media(out_files, image_url, opt.media_path);
|
||||
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p["text"] = get_media_marker();
|
||||
p.erase("image_url");
|
||||
|
||||
} else if (type == "input_audio") {
|
||||
|
|
@ -996,7 +1004,7 @@ json oaicompat_chat_params_parse(
|
|||
// TODO: add audio_url support by reusing handle_media()
|
||||
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p["text"] = get_media_marker();
|
||||
p.erase("input_audio");
|
||||
|
||||
} else if (type != "text") {
|
||||
|
|
@ -1460,7 +1468,7 @@ json convert_transcriptions_to_chatcmpl(
|
|||
if (!language.empty()) {
|
||||
prompt += string_format(" (language: %s)", language.c_str());
|
||||
}
|
||||
prompt += mtmd_default_marker();
|
||||
prompt += get_media_marker();
|
||||
|
||||
json chatcmpl_body = inp_body; // copy all fields
|
||||
chatcmpl_body["messages"] = json::array({
|
||||
|
|
|
|||
|
|
@ -92,6 +92,9 @@ std::string random_string();
|
|||
std::string gen_chatcmplid();
|
||||
std::string gen_tool_call_id();
|
||||
|
||||
// get the random media marker; it is generated once on first use and then reused, so it only changes when the server restarts
|
||||
const char * get_media_marker();
|
||||
|
||||
//
|
||||
// lora utils
|
||||
//
|
||||
|
|
|
|||
|
|
@ -708,6 +708,7 @@ private:
|
|||
mparams.warmup = params_base.warmup;
|
||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||
mparams.media_marker = get_media_marker();
|
||||
|
||||
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
||||
if (mctx == nullptr) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue