97 lines
3.4 KiB
C++
97 lines
3.4 KiB
C++
#include "server-http.h"
|
|
#include "server-task.h"
|
|
#include "server-queue.h"
|
|
|
|
#include <nlohmann/json_fwd.hpp>
|
|
|
|
#include <cstddef>
|
|
#include <memory>
|
|
|
|
struct server_context_impl; // private implementation
|
|
|
|
struct server_context_info {
|
|
std::string build_info;
|
|
std::string model_name;
|
|
bool has_inp_image;
|
|
bool has_inp_audio;
|
|
};
|
|
|
|
struct server_context {
|
|
std::unique_ptr<server_context_impl> impl;
|
|
|
|
server_context();
|
|
~server_context();
|
|
|
|
// initialize slots and server-related data
|
|
void init();
|
|
|
|
// load the model and initialize llama_context
|
|
// returns true on success
|
|
bool load_model(const common_params & params);
|
|
|
|
// this function will block main thread until termination
|
|
void start_loop();
|
|
|
|
// terminate main loop (will unblock start_loop)
|
|
void terminate();
|
|
|
|
// get the underlaying llama_context
|
|
llama_context * get_llama_context() const;
|
|
|
|
// get a new response reader, used by CLI application
|
|
server_response_reader get_response_reader();
|
|
|
|
// get server info
|
|
// used by CLI application
|
|
server_context_info get_info() const;
|
|
};
|
|
|
|
|
|
// forward declarations
|
|
struct server_res_generator;
|
|
|
|
struct server_routes {
|
|
server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; }, std::function<void()> on_shutdown = nullptr)
|
|
: params(params), ctx_server(*ctx_server.impl), is_ready(is_ready), on_shutdown(on_shutdown) {
|
|
init_routes();
|
|
}
|
|
|
|
void init_routes();
|
|
// handlers using lambda function, so that they can capture `this` without `std::bind`
|
|
server_http_context::handler_t get_health;
|
|
server_http_context::handler_t get_metrics;
|
|
server_http_context::handler_t get_slots;
|
|
server_http_context::handler_t post_slots;
|
|
server_http_context::handler_t get_props;
|
|
server_http_context::handler_t post_props;
|
|
server_http_context::handler_t get_api_show;
|
|
server_http_context::handler_t post_infill;
|
|
server_http_context::handler_t post_completions;
|
|
server_http_context::handler_t post_completions_oai;
|
|
server_http_context::handler_t post_chat_completions;
|
|
server_http_context::handler_t post_anthropic_messages;
|
|
server_http_context::handler_t post_anthropic_count_tokens;
|
|
server_http_context::handler_t post_apply_template;
|
|
server_http_context::handler_t get_models;
|
|
server_http_context::handler_t post_tokenize;
|
|
server_http_context::handler_t post_detokenize;
|
|
server_http_context::handler_t post_embeddings;
|
|
server_http_context::handler_t post_embeddings_oai;
|
|
server_http_context::handler_t post_rerank;
|
|
server_http_context::handler_t get_lora_adapters;
|
|
server_http_context::handler_t post_lora_adapters;
|
|
server_http_context::handler_t post_exit;
|
|
|
|
private:
|
|
// TODO: move these outside of server_routes?
|
|
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
|
|
|
|
const common_params & params;
|
|
server_context_impl & ctx_server;
|
|
std::function<bool()> is_ready;
|
|
const std::function<void()> on_shutdown;
|
|
};
|