#pragma once #include "common.h" #include "log.h" #include "llama.h" #include "chat.h" #include "mtmd.h" #define JSON_ASSERT GGML_ASSERT #include #include #include #include const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); using json = nlohmann::ordered_json; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) using raw_buffer = std::vector; template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value if (body.contains(key) && !body.at(key).is_null()) { try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) { LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what()); return default_value; } } else { return default_value; } } // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { ERROR_TYPE_INVALID_REQUEST, ERROR_TYPE_AUTHENTICATION, ERROR_TYPE_SERVER, ERROR_TYPE_NOT_FOUND, ERROR_TYPE_PERMISSION, ERROR_TYPE_UNAVAILABLE, // custom error ERROR_TYPE_NOT_SUPPORTED, // custom error ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error }; // thin wrapper around common_grammar_trigger with (de)serialization functions struct server_grammar_trigger { common_grammar_trigger value; server_grammar_trigger() = default; server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} server_grammar_trigger(const json & in) { value.type = (common_grammar_trigger_type) in.at("type").get(); value.value = in.at("value").get(); if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { value.token = (llama_token) in.at("token").get(); } } json to_json() const { json out { {"type", (int) value.type}, {"value", value.value}, }; if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { out["token"] = (int) value.token; } return out; } }; json format_error_response(const std::string & message, const enum error_type type); // // random string / id // std::string random_string(); std::string gen_chatcmplid(); std::string gen_tool_call_id(); // // lora utils // // check whether the given lora set has only aloras activated (empty => false) bool lora_all_alora(const std::vector & loras); // if the two sets of loras are different, they require a cache clear unless the // change is only from aloras to aloras. 
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE,          // custom error
    ERROR_TYPE_NOT_SUPPORTED,        // custom error
    ERROR_TYPE_EXCEED_CONTEXT_SIZE,  // custom error
};

// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
    common_grammar_trigger value;

    server_grammar_trigger() = default;
    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
    server_grammar_trigger(const json & in) {
        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
        value.value = in.at("value").get<std::string>();
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            value.token = (llama_token) in.at("token").get<int>();
        }
    }

    json to_json() const {
        json out {
            {"type",  (int) value.type},
            {"value", value.value},
        };
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            out["token"] = (int) value.token;
        }
        return out;
    }
};

json format_error_response(const std::string & message, const enum error_type type);

//
// random string / id
//

std::string random_string();

std::string gen_chatcmplid();

std::string gen_tool_call_id();

//
// lora utils
//

// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);

// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
        const std::vector<common_adapter_lora_info> & current,
        const std::vector<common_adapter_lora_info> & next);

std::vector<common_adapter_lora_info> parse_lora_request(
        const std::vector<common_adapter_lora_info> & lora_base,
        const json & data);

bool are_lora_equal(
        const std::vector<common_adapter_lora_info> & l1,
        const std::vector<common_adapter_lora_info> & l2);

// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);

//
// server_tokens
//

/**
 * server_tokens is a helper to manage the input tokens and images for the server.
 * it is made this way to simplify the logic of KV cache management.
 */
struct server_tokens {
    bool has_mtmd = false;

private:
    // disallow accessing these members directly, to avoid them going out of sync

    // maps a **start** index in tokens to the image chunk
    // note: the order needs to be in sync with tokens
    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;

    // list of tokens
    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
    // otherwise, it is a normal text token
    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
    // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
    llama_tokens tokens;

    // for ex. with an input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
    // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
    // idx  0   1   2   3   4     5      6      7      8      9     10
    // pos  0   1   2   3   4     5      5      5      7      7      7
    // map_idx_to_media will contain: {5, img0}, {8, img1}

public:
    server_tokens() = default;
    ~server_tokens() = default;

    // Prevent copying
    // TODO: server_tokens should be copyable - remove this:
    server_tokens(const server_tokens &) = delete;
    server_tokens & operator=(const server_tokens &) = delete;

    // Allow moving (usually implicitly generated if members are movable)
    server_tokens(server_tokens &&) = default;
    server_tokens & operator=(server_tokens &&) = default;

    // Allow accessing elements using [] operator
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token & operator[](size_t index) const { return tokens[index]; }

    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
    server_tokens(const llama_tokens & tokens, bool has_mtmd);

    // for debugging
    std::string str() const;

    llama_pos pos_next() const;

    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

    void push_back(llama_token tok);

    // will create a copy of the chunk if it contains non-text data
    void push_back(const mtmd_input_chunk * chunk);

    // appends server tokens, updates the media map. copies media chunks.
    void push_back(server_tokens & tokens);

    // for compatibility with context shift and prompt truncation
    void insert(const llama_tokens & inp_tokens);

    // for compatibility with speculative decoding, ctx shift, slot save/load
    const llama_tokens & get_text_tokens() const;

    // for compatibility with speculative decoding
    void set_token(llama_pos pos, llama_token id);

    size_t size() const { return tokens.size(); }

    bool empty() const { return tokens.empty(); }

    void clear() {
        map_idx_to_media.clear();
        tokens.clear();
    }

    void keep_first(size_t n);

    std::string detokenize(const llama_context * ctx, bool special) const;

    size_t get_common_prefix(const server_tokens & b) const;

    // make sure all text tokens are within the vocab range
    bool validate(const struct llama_context * ctx) const;

    // encode and decode the image chunk
    int32_t process_chunk(
            llama_context * ctx,
            mtmd_context * mctx,
            size_t idx,
            llama_pos pos,
            int32_t seq_id,
            size_t & n_tokens_out) const;

    server_tokens clone() const;
};
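// Illustrative usage sketch (not part of the API): wrapping a text-only prompt in
// server_tokens. The token ids and the second prompt are made up for the example.
//
//   llama_tokens ids = {1, 15043, 3186};                 // made-up token ids
//   server_tokens prompt(ids, /* has_mtmd */ false);
//
//   prompt.push_back(13);                                 // append one more text token
//   size_t n = prompt.size();                             // 4
//   size_t n_common = prompt.get_common_prefix(other);    // other: another server_tokens instance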
//
// tokenizer and input processing utils
//

bool json_is_array_of_numbers(const json & data);

// does the array contain BOTH numbers & strings?
bool json_is_array_of_mixed_numbers_strings(const json & data);

// does the array contain any individual integers/tokens?
bool json_is_array_and_contains_numbers(const json & data);

// get the values at the given paths (key1 / key2)
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);

/**
 * this handles 2 cases:
 * - only string, example: "string"
 * - mixed string and tokens, example: [12, 34, "string", 56, 78]
 */
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);

// return the last index of a character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string & text);
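// Illustrative sketch (derived from the comment above): when streaming partial output, a
// multi-byte UTF-8 character may be cut at the buffer boundary; validate_utf8() reports how
// many leading bytes are safe to emit.
//
//   std::string text = "ab\xC3";               // "\xC3" starts a 2-byte sequence that was cut off
//   size_t n_valid = validate_utf8(text);      // expected: 2, i.e. only "ab" is complete
//   std::string safe = text.substr(0, n_valid);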
// process an mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);

/**
 * break the input "prompt" object into multiple prompts if needed, then tokenize them
 * this supports these cases:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
 * and multiple prompts (multi-tasks):
 * - "prompt": ["string1", "string2"]
 * - "prompt": ["string1", [12, 34, 56]]
 * - "prompt": [[12, 34, 56], [78, 90, 12]]
 * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ] }]
 */
std::vector<server_tokens> tokenize_input_prompts(
        const llama_vocab * vocab,
        mtmd_context * mctx,
        const json & json_prompt,
        bool add_special,
        bool parse_special);

//
// OAI utils
//

// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);

struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
    std::map<std::string, std::string> chat_template_kwargs;
    common_chat_templates * tmpls;
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
    std::string media_path;
};

// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
        json & body, /* openai api json semantics */
        const oaicompat_parser_options & opt,
        std::vector<raw_buffer> & out_files);

// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);

// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(
        const json & request,
        const std::string & model_name,
        const json & embeddings,
        bool use_base64 = false);

// TODO: move it to server-task.cpp
json format_response_rerank(
        const json & request,
        const std::string & model_name,
        const json & ranks,
        bool is_tei_format,
        std::vector<std::string> & texts,
        int top_n);

//
// other utils
//

std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);

std::string safe_json_to_str(const json & data);

std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);

// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

// format a server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);

// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);

bool is_valid_utf8(const std::string & str);

//
// formatting output responses
// TODO: move these to server-task.cpp
//

llama_tokens format_prompt_infill(
        const llama_vocab * vocab,
        const json & input_prefix,
        const json & input_suffix,
        const json & input_extra,
        const int n_batch,
        const int n_predict,
        const int n_ctx,
        const bool spm_infill,
        const llama_tokens & tokens_prompt);

// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
server_tokens format_prompt_rerank(
        const struct llama_model * model,
        const struct llama_vocab * vocab,
        mtmd_context * mctx,
        const std::string & query,
        const std::string & doc);
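// Illustrative sketch (derived from the format comment above; the query and document
// strings are made up, and passing a null mctx for a text-only model is an assumption):
//
//   server_tokens rr = format_prompt_rerank(model, vocab, /* mctx */ nullptr,
//                                           "what is panda?",
//                                           "The giant panda is a bear.");
//   // resulting layout: [BOS] what is panda? [EOS] [SEP] The giant panda is a bear. [EOS]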