#pragma once

#include "server-http.h"
#include "server-task.h"
#include "server-queue.h"

// NOTE(review): the angle-bracket include targets were missing from the reviewed
// text. The headers below are the standard ones this file visibly uses; confirm
// whether any additional (e.g. JSON) header was originally included here.
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

struct server_context_impl; // private implementation

// Metadata describing the server and the loaded model.
// Returned by value from server_context::get_meta() and cached by server_routes.
//
// The scalar members carry no default initializers: a default-initialized
// instance (`server_context_meta m;`) leaves them indeterminate, so every field
// must be assigned before the object is read.
struct server_context_meta {
    std::string build_info;
    std::string model_name;
    std::string model_path;
    bool has_mtmd;
    bool has_inp_image;
    bool has_inp_audio;
    json json_webui_settings;
    int slot_n_ctx;
    enum llama_pooling_type pooling_type;

    // chat template
    std::string chat_template;
    std::string chat_template_tool_use;

    // tokens
    std::string bos_token_str;
    std::string eos_token_str;
    llama_token fim_pre_token;
    llama_token fim_sub_token;
    llama_token fim_mid_token;

    // model meta
    enum llama_vocab_type model_vocab_type;
    int32_t  model_vocab_n_tokens;
    int32_t  model_n_ctx_train;
    int32_t  model_n_embd_inp;
    uint64_t model_n_params;
    uint64_t model_size;
};

// Public facade over the private server_context_impl.
//
// server_context_impl is only forward-declared in this header, so the
// constructor and destructor are declared here and must be defined in a
// translation unit where the implementation type is complete.
struct server_context {
    std::unique_ptr<server_context_impl> impl;

    server_context();
    ~server_context();

    // load the model and initialize llama_context
    // returns true on success
    bool load_model(const common_params & params);

    // this function will block main thread until termination
    void start_loop();

    // terminate main loop (will unblock start_loop)
    void terminate();

    // get the underlying llama_context, can return nullptr if sleeping
    // not thread-safe, should only be used from the main thread
    llama_context * get_llama_context() const;

    // get a new response reader, used by CLI application
    server_response_reader get_response_reader();

    // get server metadata (read-only), can only be called after load_model()
    // not thread-safe, should only be used from the main thread
    server_context_meta get_meta() const;
};

// forward declarations
struct server_res_generator;

// HTTP route handlers of the server, plus the private helpers they share.
struct server_routes {
    server_routes(const common_params & params, server_context & ctx_server);

    void init_routes();

    // replace the cached metadata with a fresh copy taken from ctx_server
    // note: this is not thread-safe and can only be called when ctx_http.is_ready is false
    void update_meta(const server_context & ctx_server) {
        this->meta = std::make_unique<const server_context_meta>(ctx_server.get_meta());
    }

    // handlers using lambda function, so that they can capture `this` without `std::bind`
    // they won't be called until ctx_http.is_ready is set to true
    server_http_context::handler_t get_health;
    server_http_context::handler_t get_metrics;
    server_http_context::handler_t get_slots;
    server_http_context::handler_t post_slots;
    server_http_context::handler_t get_props;
    server_http_context::handler_t post_props;
    server_http_context::handler_t get_api_show;
    server_http_context::handler_t post_infill;
    server_http_context::handler_t post_completions;
    server_http_context::handler_t post_completions_oai;
    server_http_context::handler_t post_chat_completions;
    server_http_context::handler_t post_anthropic_messages;
    server_http_context::handler_t post_anthropic_count_tokens;
    server_http_context::handler_t post_apply_template;
    server_http_context::handler_t get_models;
    server_http_context::handler_t post_tokenize;
    server_http_context::handler_t post_detokenize;
    server_http_context::handler_t post_embeddings;
    server_http_context::handler_t post_embeddings_oai;
    server_http_context::handler_t post_rerank;
    server_http_context::handler_t get_lora_adapters;
    server_http_context::handler_t post_lora_adapters;

private:
    // NOTE(review): the element type of `files` was missing from the reviewed
    // text; `raw_buffer` is assumed here - confirm against the definition.
    std::unique_ptr<server_res_generator> handle_completions_impl(
            const server_http_req & req,
            server_task_type type,
            const json & data,
            const std::vector<raw_buffer> & files,
            task_response_type res_type);

    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
    std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);

    // using unique_ptr to allow late initialization of const
    std::unique_ptr<const server_context_meta> meta;

    const common_params & params;
    const server_context_impl & ctx_server;
    server_queue & queue_tasks;
    server_response & queue_results;

    std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
};