#include "server-http.h" #include "server-task.h" #include "server-queue.h" #include #include #include struct server_context_impl; // private implementation struct server_context { std::unique_ptr impl; server_context(); ~server_context(); // initialize slots and server-related data void init(); // load the model and initialize llama_context // returns true on success bool load_model(const common_params & params); // this function will block main thread until termination void start_loop(); // terminate main loop (will unblock start_loop) void terminate(); // get the underlaying llama_context llama_context * get_llama_context() const; // get the underlaying queue_tasks and queue_results // used by CLI application std::pair get_queues(); }; // forward declarations struct server_res_generator; struct server_routes { server_routes(const common_params & params, server_context & ctx_server, std::function is_ready = []() { return true; }) : params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) { init_routes(); } void init_routes(); // handlers using lambda function, so that they can capture `this` without `std::bind` server_http_context::handler_t get_health; server_http_context::handler_t get_metrics; server_http_context::handler_t get_slots; server_http_context::handler_t post_slots; server_http_context::handler_t get_props; server_http_context::handler_t post_props; server_http_context::handler_t get_api_show; server_http_context::handler_t post_infill; server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; server_http_context::handler_t post_apply_template; server_http_context::handler_t get_models; server_http_context::handler_t post_tokenize; server_http_context::handler_t post_detokenize; server_http_context::handler_t post_embeddings; server_http_context::handler_t post_embeddings_oai; server_http_context::handler_t post_rerank; server_http_context::handler_t get_lora_adapters; server_http_context::handler_t post_lora_adapters; private: // TODO: move these outside of server_routes? std::unique_ptr handle_slots_save(const server_http_req & req, int id_slot); std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot); std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot); std::unique_ptr handle_embeddings_impl(const server_http_req & req, task_response_type res_type); const common_params & params; server_context_impl & ctx_server; std::function is_ready; };