Allow VisionEmbedding to recognize embedded images without loading mmproj

1. Add the /vision/embedding endpoint. 2. Extend the OAI-compatible chat API to accept pre-computed image embeddings (as a float array or as base64-encoded floats).
2026-03-08 15:44:19 +08:00 · 2026-03-08 15:44:19 +08:00 · ff89f63285
parent c5a778891b
commit ff89f63285
6 changed files with 430 additions and 37 deletions
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -166,7 +166,7 @@ static inline bool is_base64(uint8_t c) {
    return (isalnum(c) || (c == '+') || (c == '/'));
 }

-static inline raw_buffer base64_decode(const std::string & encoded_string) {
+// Decode a base64 string into raw bytes.
+// Deliberately not `inline`: server-common.h declares this function for other translation units,
+// so it needs a single out-of-line definition with external linkage.
+raw_buffer base64_decode(const std::string & encoded_string) {
    int i = 0;
    int j = 0;
    int in_ = 0;
@ -232,7 +232,7 @@ server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_m
 }

 llama_pos server_tokens::pos_next(int64_t n_tokens) const {
-    if (!has_mtmd) {
+    if (!has_mtmd && !has_precomputed) {
        if (n_tokens < 0) {
            return tokens.size();
        }
@ -258,6 +258,7 @@ llama_pos server_tokens::pos_next(int64_t n_tokens) const {

    while (idx < n_tokens) {
        const auto media_it = map_idx_to_media.find(idx);
+        const auto precomp_it = map_idx_to_precomputed.find(idx);
        if (media_it != map_idx_to_media.end()) {
            const auto & chunk = media_it->second;
            const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
@ -265,6 +266,10 @@ llama_pos server_tokens::pos_next(int64_t n_tokens) const {

            pos += n_pos;
            idx += n_tok;
+        } else if (precomp_it != map_idx_to_precomputed.end()) {
+            const auto & img = precomp_it->second;
+            pos += img.n_tokens; // n_pos == n_tokens for precomputed
+            idx += img.n_tokens;
        } else {
            pos++;
            idx++;
@ -395,7 +400,7 @@ void server_tokens::set_token(llama_pos pos, llama_token id) {

 void server_tokens::keep_first(size_t n) {
    GGML_ASSERT(n <= tokens.size());
-    if (has_mtmd) {
+    if (has_mtmd || has_precomputed) {
        if (n == tokens.size()) {
            return; // nothing to do
        }
@ -410,7 +415,11 @@ void server_tokens::keep_first(size_t n) {
            // note that the case where we keep a full image at the end is allowed:
            //   tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
            if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
-                find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                // tokens[n - 1] must be the first token of a media chunk. Pre-computed images are
+                // tracked in their own map, so only fall back to find_chunk() (which throws when
+                // n - 1 is not a begin-of-chunk index) if no pre-computed image starts there.
+                if (!has_precomputed_at(n - 1)) {
+                    find_chunk(n - 1);
+                }
+            }
            }
        }
        // remove all image chunks that are not used anymore
@ -422,7 +431,14 @@ void server_tokens::keep_first(size_t n) {
                ++it;
            }
        }
-    }
+        for (auto it = map_idx_to_precomputed.begin(); it != map_idx_to_precomputed.end(); ) {
+            size_t idx = it->first;
+            if (idx >= n) {
+                it = map_idx_to_precomputed.erase(it);
+            } else {
+                ++it;
+            }
+        }
    tokens.resize(n);
 }

@ -495,13 +511,24 @@ bool server_tokens::validate(const struct llama_context * ctx) const {
    for (size_t i = 0; i < tokens.size(); ++i) {
        const auto & t = tokens[i];
        if (t == LLAMA_TOKEN_NULL) {
-            try {
-                const auto & chunk = find_chunk(i);
-                size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get());
-                i += n_tokens - 1; // will be +1 by the for loop
-            } catch (const std::exception & e) {
-                return false;
+            // check mtmd chunk first
+            auto media_it = map_idx_to_media.find(i);
+            if (media_it != map_idx_to_media.end()) {
+                try {
+                    size_t n_tokens = mtmd_input_chunk_get_n_tokens(media_it->second.get());
+                    i += n_tokens - 1;
+                } catch (const std::exception & e) {
+                    return false;
+                }
+                continue;
            }
+            // check precomputed
+            auto precomp_it = map_idx_to_precomputed.find(i);
+            if (precomp_it != map_idx_to_precomputed.end()) {
+                i += precomp_it->second.n_tokens - 1;
+                continue;
+            }
+            return false;
        } else if (t < 0 || t >= n_vocab) {
            return false;
        }
@ -539,16 +566,104 @@ int32_t server_tokens::process_chunk(
    n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
    return 0;
 }
+// Decode one pre-computed image straight into the llama context; unlike process_chunk() no mtmd
+// context is needed because the embeddings already exist.
+//
+//   idx          start index (in `tokens`) of the image; find_precomputed() throws if none starts there
+//   pos          position given to the first embedding row, following rows get consecutive positions
+//   seq_id       sequence every row is assigned to
+//   n_tokens_out number of rows decoded on success, 0 on failure
+//
+// Returns 0 on success, -1 if the stored image is malformed, otherwise the non-zero llama_decode() status.
+int32_t server_tokens::process_precomputed_chunk(
+            llama_context * ctx,
+            size_t idx,
+            llama_pos pos,
+            int32_t seq_id,
+            size_t & n_tokens_out) const {
+    const auto & img = find_precomputed(idx);
+    SRV_INF("processing pre-computed image embeddings (%d tokens, %d embd)...\n", img.n_tokens, img.n_embd);

+    n_tokens_out = 0;
+
+    // the embedding originates from a request body: never let llama_decode() read past the buffer
+    if (img.n_tokens <= 0 || img.n_embd <= 0 ||
+        img.embedding.size() != (size_t) img.n_tokens * (size_t) img.n_embd) {
+        LOG_ERR("invalid pre-computed image embeddings: n_tokens = %d, n_embd = %d, n_floats = %zu\n",
+                img.n_tokens, img.n_embd, img.embedding.size());
+        return -1;
+    }
+    // NOTE(review): img.n_embd is not compared with the model's input embedding size here - confirm the caller guarantees it
+
+    const int32_t n_batch = llama_n_batch(ctx);
+    const int64_t t0      = ggml_time_ms();
+
+    const int32_t n_img_batches = (img.n_tokens + n_batch - 1) / n_batch;
+
+    // per-row metadata, sized for the largest batch and reused by every batch
+    const int32_t n_rows_max = std::min(n_batch, img.n_tokens);
+    std::vector<llama_seq_id>    batch_seq_id_0(1, seq_id);
+    std::vector<llama_pos>       batch_pos(n_rows_max);
+    std::vector<int32_t>         batch_n_seq_id(n_rows_max, 1);
+    std::vector<llama_seq_id *>  batch_seq_id(n_rows_max, batch_seq_id_0.data());
+    std::vector<int8_t>          batch_logits(n_rows_max, false);
+
+    for (int32_t i_batch = 0; i_batch < n_img_batches; i_batch++) {
+        const int32_t pos_offset     = i_batch * n_batch;
+        const int32_t n_tokens_batch = std::min(n_batch, img.n_tokens - pos_offset);
+
+        for (int32_t j = 0; j < n_tokens_batch; j++) {
+            batch_pos[j] = pos + pos_offset + j;
+        }
+
+        // set logits for last token of the last batch (no reset needed: no batch follows it)
+        if (i_batch == n_img_batches - 1) {
+            batch_logits[n_tokens_batch - 1] = true;
+        }
+
+        // rows are stored back to back, n_embd floats each; the batch wants a non-const pointer
+        llama_batch batch = {
+            /*n_tokens       =*/ n_tokens_batch,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ const_cast<float *>(img.embedding.data() + (size_t)pos_offset * img.n_embd),
+            /*pos            =*/ batch_pos.data(),
+            /*n_seq_id       =*/ batch_n_seq_id.data(),
+            /*seq_id         =*/ batch_seq_id.data(),
+            /*logits         =*/ batch_logits.data(),
+        };
+
+        SRV_INF("decoding pre-computed image batch %d/%d, n_tokens_batch = %d\n", i_batch + 1, n_img_batches, n_tokens_batch);
+
+        const int32_t ret = llama_decode(ctx, batch);
+        if (ret != 0) {
+            LOG_ERR("failed to decode pre-computed image embeddings, ret = %d\n", ret);
+            return ret;
+        }
+    }
+
+    SRV_INF("pre-computed image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+    n_tokens_out = img.n_tokens;
+    return 0;
+}
+
+// Append a pre-computed image: one LLAMA_TOKEN_NULL placeholder per embedding row, and a copy of
+// the image registered under the index of its first placeholder.
+void server_tokens::push_back_precomputed(const server_precomputed_image & img) {
+    const size_t first_idx = tokens.size();
+    if (img.n_tokens > 0) {
+        tokens.insert(tokens.end(), (size_t) img.n_tokens, LLAMA_TOKEN_NULL);
+    }
+    map_idx_to_precomputed[first_idx] = img;
+    has_precomputed = true;
+}
+
+// True when a pre-computed image starts exactly at token index idx.
+bool server_tokens::has_precomputed_at(size_t idx) const {
+    return map_idx_to_precomputed.count(idx) != 0;
+}
+
+// Return the pre-computed image that starts at token index idx.
+// Throws std::runtime_error when no image starts at that index.
+const server_precomputed_image & server_tokens::find_precomputed(size_t idx) const {
+    const auto entry = map_idx_to_precomputed.find(idx);
+    if (entry == map_idx_to_precomputed.end()) {
+        throw std::runtime_error("Pre-computed image embeddings not found at index " + std::to_string(idx));
+    }
+    return entry->second;
+}
 server_tokens server_tokens::clone() const {
    server_tokens res;
    res.has_mtmd = has_mtmd;
+    res.has_precomputed = has_precomputed;
    res.tokens   = tokens;
    for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
        size_t idx = it->first;
        const mtmd::input_chunk_ptr & chunk = it->second;
        res.map_idx_to_media[idx] = mtmd::input_chunk_ptr(mtmd_input_chunk_copy(chunk.get()));
    }
+    res.map_idx_to_precomputed = map_idx_to_precomputed; // copies by value, so the clone owns its own embedding vectors
    return res;
 }

@ -725,6 +840,58 @@ server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::
    return result;
 }

+// Build the server_tokens for a prompt whose media markers stand for pre-computed images:
+// the i-th marker is replaced by precomputed_images[i], the text in between is tokenized.
+// Throws std::runtime_error when the number of markers and the number of images differ.
+server_tokens process_precomputed_image_prompt(
+        const llama_vocab * vocab,
+        const std::string & prompt,
+        const std::vector<server_precomputed_image> & precomputed_images) {
+    const std::string marker = mtmd_default_marker();
+
+    server_tokens result;
+    result.has_precomputed = true;
+
+    size_t n_used = 0; // images consumed so far
+
+    // tokenize one text segment and append it; the third common_tokenize() argument is only
+    // true while no image has been inserted yet
+    const auto append_text = [&](const std::string & text) {
+        if (text.empty()) {
+            return;
+        }
+        const llama_tokens text_tokens = common_tokenize(vocab, text, n_used == 0, true);
+        for (const auto tok : text_tokens) {
+            result.push_back(tok);
+        }
+    };
+
+    size_t cursor = 0; // start of the part of the prompt not handled yet
+    for (;;) {
+        const size_t hit = prompt.find(marker, cursor);
+        if (hit == std::string::npos) {
+            // no marker left: everything that remains is plain text
+            append_text(prompt.substr(cursor));
+            break;
+        }
+
+        // text in front of the marker first, then the image that replaces the marker
+        append_text(prompt.substr(cursor, hit - cursor));
+
+        if (n_used >= precomputed_images.size()) {
+            throw std::runtime_error("Not enough precomputed images for the number of media markers in prompt");
+        }
+        result.push_back_precomputed(precomputed_images[n_used]);
+        n_used++;
+
+        cursor = hit + marker.size();
+    }
+
+    if (n_used != precomputed_images.size()) {
+        throw std::runtime_error("Number of precomputed images does not match the number of media markers in prompt");
+    }
+
+    return result;
+}
+
+
 /**
 * break the input "prompt" object into multiple prompt if needed, then tokenize them
 * use tokenize_input_prompts() if the input could be an array.
@ -889,7 +1056,8 @@ static void handle_media(
 json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
    const server_chat_params & opt,
-    std::vector<raw_buffer> & out_files)
+    std::vector<raw_buffer> & out_files,
+    std::vector<server_precomputed_image> & out_precomputed_images)
 {
    json llama_params;

@ -978,7 +1146,42 @@ json oaicompat_chat_params_parse(
                p["text"] = mtmd_default_marker();
                p.erase("image_url");

-            } else if (type == "input_audio") {
+            } else if (type == "image_embedding" || type == "image_embedding_b64") {
+                // Pre-computed image embeddings supplied by the client in place of an image:
+                //   "image_embedding"     -> "embedding":     JSON array of n_tokens * n_embd floats
+                //   "image_embedding_b64" -> "embedding_b64": base64 of the same floats as raw bytes
+                // The content part is turned into a media marker and the embedding is appended to
+                // out_precomputed_images. Everything here comes from the request body, so it is
+                // validated before being accepted.
+                const bool        is_b64   = type == "image_embedding_b64";
+                const std::string embd_key = is_b64 ? "embedding_b64" : "embedding";
+
+                json emb_data = json_value(p, type, json::object());
+                if (!emb_data.contains(embd_key) || !emb_data.contains("n_tokens") || !emb_data.contains("n_embd")) {
+                    throw std::invalid_argument(type + " must contain '" + embd_key + "', 'n_tokens', and 'n_embd'");
+                }
+
+                server_precomputed_image img;
+                img.n_tokens = emb_data.at("n_tokens").get<int32_t>();
+                img.n_embd   = emb_data.at("n_embd").get<int32_t>();
+
+                // non-positive sizes would wrap around in the size_t arithmetic below
+                if (img.n_tokens <= 0 || img.n_embd <= 0) {
+                    throw std::invalid_argument(type + ": 'n_tokens' and 'n_embd' must be positive");
+                }
+                const size_t n_floats = (size_t) img.n_tokens * (size_t) img.n_embd;
+
+                if (!is_b64) {
+                    const auto & embd_arr = emb_data.at("embedding");
+                    if (!embd_arr.is_array()) {
+                        throw std::invalid_argument(type + ".embedding must be an array of floats");
+                    }
+                    // the decoder later reads n_tokens * n_embd floats, so the array must hold exactly that many
+                    if (embd_arr.size() != n_floats) {
+                        throw std::invalid_argument(type + ".embedding size mismatch: expected " +
+                            std::to_string(n_floats) + " floats, got " + std::to_string(embd_arr.size()));
+                    }
+                    img.embedding.reserve(n_floats);
+                    for (const auto & v : embd_arr) {
+                        img.embedding.push_back(v.get<float>());
+                    }
+                } else {
+                    const std::string & embd_b64 = emb_data.at("embedding_b64").get_ref<const std::string &>();
+                    raw_buffer raw_bytes = base64_decode(embd_b64);
+                    const size_t expected_bytes = n_floats * sizeof(float);
+                    if (raw_bytes.size() != expected_bytes) {
+                        throw std::invalid_argument(type + ".embedding size mismatch: expected " +
+                            std::to_string(expected_bytes) + " bytes, got " + std::to_string(raw_bytes.size()));
+                    }
+                    // the decoded bytes are taken as floats in the server's native representation
+                    const float * float_ptr = reinterpret_cast<const float *>(raw_bytes.data());
+                    img.embedding.assign(float_ptr, float_ptr + n_floats);
+                }
+                out_precomputed_images.push_back(std::move(img));
+
+                p["type"] = "media_marker";
+                p["text"] = mtmd_default_marker();
+                p.erase(type);
+            } else if (type == "input_audio") {
                if (!opt.allow_audio) {
                    throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                }
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@ -114,6 +114,18 @@ bool are_lora_equal(
 // get the ids of all enabled loras
 std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);

+//
+// pre-computed image embeddings (from /vision/embedding)
+//
+
+raw_buffer base64_decode(const std::string & encoded_string);
+
+// Image embeddings computed ahead of time; stands in for an image inside a prompt.
+struct server_precomputed_image {
+    std::vector<float> embedding;    // n_tokens * n_embd floats, one row of n_embd floats per token
+    int32_t            n_tokens = 0; // number of embedding rows
+    int32_t            n_embd   = 0; // number of floats per row
+};
+
 //
 // server_tokens
 //
@ -124,9 +136,13 @@ std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_i
 */
 struct server_tokens {
    bool has_mtmd = false;
+    bool has_precomputed = false; // set to true when pre-computed image embeddings are used

 private: // disallow accessing these members directly, risking out-of-sync

+    // map a **start** index in tokens to pre-computed image embeddings (from /vision/embedding)
+    std::map<size_t, server_precomputed_image> map_idx_to_precomputed;
+
    // map a **start** index in tokens to the image chunk
    // note: the order need to be in-sync with tokens
    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
@ -198,6 +214,7 @@ public:

    void clear() {
        map_idx_to_media.clear();
+        map_idx_to_precomputed.clear();
        tokens.clear();
    }

@ -219,6 +236,23 @@ public:
                int32_t seq_id,
                size_t & n_tokens_out) const;

+    // decode pre-computed image embeddings (no CLIP encoding needed)
+    int32_t process_precomputed_chunk(
+                llama_context * ctx,
+                size_t idx,
+                llama_pos pos,
+                int32_t seq_id,
+                size_t & n_tokens_out) const;
+
+    // push pre-computed image embedding tokens
+    void push_back_precomputed(const server_precomputed_image & img);
+
+    // check if a position has pre-computed embeddings
+    bool has_precomputed_at(size_t idx) const;
+
+    // find pre-computed embeddings at a position
+    const server_precomputed_image & find_precomputed(size_t idx) const;
+
    server_tokens clone() const;
 };

@ -253,6 +287,12 @@ size_t validate_utf8(const std::string& text);
 // process mtmd prompt, return the server_tokens containing both text tokens and media chunks
 server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);

+// process prompt with pre-computed image embeddings, return server_tokens without requiring mtmd_context
+server_tokens process_precomputed_image_prompt(
+    const llama_vocab * vocab,
+    const std::string & prompt,
+    const std::vector<server_precomputed_image> & precomputed_images);
+
 /**
 * break the input "prompt" object into multiple prompt if needed, then tokenize them
 * this supports these cases:
@ -297,7 +337,8 @@ json oaicompat_completion_params_parse(const json & body);
 json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
    const server_chat_params & opt,
-    std::vector<raw_buffer> & out_files);
+    std::vector<raw_buffer> & out_files,
+    std::vector<server_precomputed_image> & out_precomputed_images);

 // convert OpenAI Responses API format to OpenAI Chat Completions API format
 json convert_responses_to_chatcmpl(const json & body);
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -11,6 +11,7 @@
 #include "speculative.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "base64.hpp"

 #include <algorithm>
 #include <cstddef>
@ -2440,22 +2441,42 @@ private:

                    // check if we should process the image
                    if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
-                        // process the image
-                        size_t n_tokens_out = 0;
-                        int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
-                        if (res != 0) {
-                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
-                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
-                            slot.release();
-                            continue;
-                        }
+                        if (input_tokens.has_precomputed_at(slot.prompt.n_tokens())) {
+                            // precomputed image embeddings path: decode directly
+                            size_t n_tokens_out = 0;
+                            int32_t res = input_tokens.process_precomputed_chunk(ctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
+                            if (res != 0) {
+                                SLT_ERR(slot, "failed to process precomputed image, res = %d\n", res);
+                                send_error(slot, "failed to process precomputed image", ERROR_TYPE_SERVER);
+                                slot.release();
+                                continue;
+                            }

-                        slot.n_prompt_tokens_processed += n_tokens_out;
+                            slot.n_prompt_tokens_processed += n_tokens_out;

-                        // add the image chunk to cache
-                        {
-                            const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
-                            slot.prompt.tokens.push_back(chunk.get()); // copy
+                            // add precomputed chunk to cache
+                            {
+                                const auto & img = input_tokens.find_precomputed(slot.prompt.n_tokens());
+                                slot.prompt.tokens.push_back_precomputed(img);
+                            }
+                        } else {
+                            // process the image via CLIP
+                            size_t n_tokens_out = 0;
+                            int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
+                            if (res != 0) {
+                                SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                                send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+                                slot.release();
+                                continue;
+                            }
+
+                            slot.n_prompt_tokens_processed += n_tokens_out;
+
+                            // add the image chunk to cache
+                            {
+                                const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
+                                slot.prompt.tokens.push_back(chunk.get()); // copy
+                            }
                        }
                    }

@ -2983,7 +3004,8 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
            server_task_type type,
            const json & data,
            const std::vector<raw_buffer> & files,
-            task_response_type res_type) {
+            task_response_type res_type,
+            const std::vector<server_precomputed_image> & precomputed_images) {
    GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

    auto res = create_response();
@ -3000,7 +3022,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
        // process prompt
        std::vector<server_tokens> inputs;

-        if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) {
+        if (!precomputed_images.empty()) {
+            // Precomputed image embeddings path: bypass CLIP, inject embeddings directly
+            inputs.push_back(process_precomputed_image_prompt(ctx_server.vocab, prompt.get<std::string>(), precomputed_images));
+        } else if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) {
            // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below.
            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
        } else {
@ -3601,29 +3626,34 @@ void server_routes::init_routes() {
    this->post_chat_completions = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
+        std::vector<server_precomputed_image> precomputed_images;
        json body = json::parse(req.body);
        json body_parsed = oaicompat_chat_params_parse(
            body,
            meta->chat_params,
-            files);
+            files,
+            precomputed_images);
        return handle_completions_impl(
            req,
            SERVER_TASK_TYPE_COMPLETION,
            body_parsed,
            files,
-            TASK_RESPONSE_TYPE_OAI_CHAT);
+            TASK_RESPONSE_TYPE_OAI_CHAT,
+            precomputed_images);
    };

    this->post_responses_oai = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
+        std::vector<server_precomputed_image> precomputed_images;
        json body = convert_responses_to_chatcmpl(json::parse(req.body));
        SRV_DBG("%s\n", "Request converted: OpenAI Responses -> OpenAI Chat Completions");
        SRV_DBG("converted request: %s\n", body.dump().c_str());
        json body_parsed = oaicompat_chat_params_parse(
            body,
            meta->chat_params,
-            files);
+            files,
+            precomputed_images);
        return handle_completions_impl(
            req,
            SERVER_TASK_TYPE_COMPLETION,
@ -3635,13 +3665,15 @@ void server_routes::init_routes() {
    this->post_anthropic_messages = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
+        std::vector<server_precomputed_image> precomputed_images;
        json body = convert_anthropic_to_oai(json::parse(req.body));
        SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
        SRV_DBG("converted request: %s\n", body.dump().c_str());
        json body_parsed = oaicompat_chat_params_parse(
            body,
            meta->chat_params,
-            files);
+            files,
+            precomputed_images);
        return handle_completions_impl(
            req,
            SERVER_TASK_TYPE_COMPLETION,
@ -3653,13 +3685,15 @@ void server_routes::init_routes() {
    this->post_anthropic_count_tokens = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files;
+        std::vector<server_precomputed_image> precomputed_images;
        json body = convert_anthropic_to_oai(json::parse(req.body));
        SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
        SRV_DBG("converted request: %s\n", body.dump().c_str());
        json body_parsed = oaicompat_chat_params_parse(
            body,
            meta->chat_params,
-            files);
+            files,
+            precomputed_images);

        json prompt = body_parsed.at("prompt");
        llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true);
@ -3671,15 +3705,124 @@ void server_routes::init_routes() {
    this->post_apply_template = [this](const server_http_req & req) {
        auto res = create_response();
        std::vector<raw_buffer> files; // dummy, unused
+        std::vector<server_precomputed_image> precomputed_images; // dummy, unused
        json body = json::parse(req.body);
        json data = oaicompat_chat_params_parse(
            body,
            meta->chat_params,
-            files);
+            files,
+            precomputed_images);
        res->ok({{ "prompt", std::move(data.at("prompt")) }});
        return res;
    };

+    // POST /vision/embedding: encode one image with the multimodal context and return its embeddings.
+    //   request : { "image": "<base64, optionally with a data URI prefix>", "b64": <bool, default false> }
+    //   response: { "n_tokens", "n_embd", and "embedding" (float array) or "embedding_b64" when b64 is true }
+    // Answers with ERROR_TYPE_NOT_SUPPORTED when the server has no multimodal context; every other
+    // failure is reported as ERROR_TYPE_INVALID_REQUEST.
+    this->post_vision_embedding = [this](const server_http_req & req) {
+        auto res = create_response();
+
+        if (ctx_server.mctx == nullptr) {
+            res->error(format_error_response("This server does not support multimodal. Start it with a multimodal projector (--mmproj)", ERROR_TYPE_NOT_SUPPORTED));
+            return res;
+        }
+
+        try {
+            json body = json::parse(req.body);
+            if (!body.contains("image") || !body.at("image").is_string()) {
+                throw std::runtime_error("Request must contain 'image' field with base64-encoded image data");
+            }
+
+            const std::string & image_b64 = body.at("image").get_ref<const std::string &>();
+            bool return_b64 = json_value(body, "b64", false);
+
+            // strip data URI prefix if present
+            std::string b64_data;
+            {
+                auto comma_pos = image_b64.find(',');
+                if (comma_pos != std::string::npos && image_b64.substr(0, comma_pos).find("base64") != std::string::npos) {
+                    b64_data = image_b64.substr(comma_pos + 1);
+                } else {
+                    b64_data = image_b64;
+                }
+            }
+
+            // base64 decode the image
+            raw_buffer image_data = base64_decode(b64_data);
+            if (image_data.empty()) {
+                throw std::runtime_error("Failed to decode base64 image data");
+            }
+
+            // create bitmap from buffer
+            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, image_data.data(), image_data.size()));
+            if (!bmp.ptr) {
+                throw std::runtime_error("Failed to load image from buffer");
+            }
+            bmp.set_id("vision_embedding");
+
+            // tokenize with a dummy prompt containing the media marker
+            std::string dummy_prompt = mtmd_default_marker();
+            mtmd_input_text inp_txt = {
+                dummy_prompt.c_str(),
+                /* add_special */   false,
+                /* parse_special */ true,
+            };
+            mtmd::input_chunks chunks(mtmd_input_chunks_init());
+            const mtmd_bitmap * bmp_ptr = bmp.ptr.get();
+            int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
+                                              chunks.ptr.get(),
+                                              &inp_txt,
+                                              &bmp_ptr,
+                                              1);
+            if (tokenized != 0) {
+                throw std::runtime_error("Failed to tokenize image");
+            }
+
+            // find the first media chunk, encode it and answer with its embeddings
+            // NOTE(review): audio chunks are accepted as well although the messages only mention images - confirm this is intended
+            for (size_t i = 0; i < chunks.size(); i++) {
+                const auto * chunk = mtmd_input_chunks_get(chunks.ptr.get(), i);
+                auto chunk_type = mtmd_input_chunk_get_type(chunk);
+                if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+                    // NOTE(review): mctx is also used when slots process image chunks - confirm this
+                    // handler can never encode concurrently with that code
+                    int32_t encode_res = mtmd_encode_chunk(ctx_server.mctx, chunk);
+                    if (encode_res != 0) {
+                        throw std::runtime_error("Failed to encode image chunk");
+                    }
+
+                    size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
+                    int32_t n_embd = meta->model_n_embd_inp;
+                    const float * embd = mtmd_get_output_embd(ctx_server.mctx);
+                    if (embd == nullptr) {
+                        // nothing to read from: fail the request instead of dereferencing a null pointer
+                        throw std::runtime_error("Failed to get image embeddings");
+                    }
+
+                    // build response based on b64 option
+                    size_t total_floats = n_tokens * n_embd;
+                    size_t total_bytes = total_floats * sizeof(float);
+
+                    json result = {
+                        {"n_tokens", (int32_t) n_tokens},
+                        {"n_embd",   n_embd},
+                    };
+
+                    if (return_b64) {
+                        // only base64: the floats are encoded as raw bytes
+                        result["embedding_b64"] = base64::encode(
+                            reinterpret_cast<const char *>(embd), total_bytes);
+                    } else {
+                        // only array of floats
+                        json embedding = json::array();
+                        for (size_t j = 0; j < total_floats; j++) {
+                            embedding.push_back(embd[j]);
+                        }
+                        result["embedding"] = std::move(embedding);
+                    }
+
+                    res->ok(std::move(result));
+                    return res;
+                }
+            }
+
+            throw std::runtime_error("No image chunk found after tokenization");
+        } catch (const std::exception & e) {
+            res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+    };
+
    this->get_models = [this](const server_http_req &) {
        auto res = create_response(true);

--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@ -115,13 +115,15 @@ struct server_routes {
    server_http_context::handler_t post_rerank;
    server_http_context::handler_t get_lora_adapters;
    server_http_context::handler_t post_lora_adapters;
+    server_http_context::handler_t post_vision_embedding;
 private:
    std::unique_ptr<server_res_generator> handle_completions_impl(
            const server_http_req & req,
            server_task_type type,
            const json & data,
            const std::vector<raw_buffer> & files,
-            task_response_type res_type);
+            task_response_type res_type,
+            const std::vector<server_precomputed_image> & precomputed_images = {});
    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@ -119,6 +119,8 @@ bool server_http_context::init(const common_params & params) {
        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
    }

+    srv->set_payload_max_length(1024LL * 1024LL * 1024LL); // raise the request body limit to 1 GiB so large base64 embeddings fit; known workaround: cpp-httplib offers no per-endpoint payload limit, so the limit is raised for every route instead of only the image embedding endpoint
+
    //
    // Middlewares
    //
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -158,6 +158,7 @@ int main(int argc, char ** argv) {
        routes.post_lora_adapters          = models_routes->proxy_post;
        routes.get_slots                   = models_routes->proxy_get;
        routes.post_slots                  = models_routes->proxy_post;
+        routes.post_vision_embedding        = models_routes->proxy_post;

        // custom routes for router
        routes.get_props  = models_routes->get_router_props;
@ -196,6 +197,7 @@ int main(int argc, char ** argv) {
    ctx_http.post("/tokenize",            ex_wrapper(routes.post_tokenize));
    ctx_http.post("/detokenize",          ex_wrapper(routes.post_detokenize));
    ctx_http.post("/apply-template",      ex_wrapper(routes.post_apply_template));
+    ctx_http.post("/vision/embedding",      ex_wrapper(routes.post_vision_embedding));
    // LoRA adapters hotswap
    ctx_http.get ("/lora-adapters",       ex_wrapper(routes.get_lora_adapters));
    ctx_http.post("/lora-adapters",       ex_wrapper(routes.post_lora_adapters));