#include "server-context.h" #include "server-http.h" #include "arg.h" #include "common.h" #include "llama.h" #include "log.h" #include #include #include // for std::thread::hardware_concurrency #if defined(_WIN32) #include #endif static std::function shutdown_handler; static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; static inline void signal_handler(int signal) { if (is_terminating.test_and_set()) { // in case it hangs, we can force terminate the server by hitting Ctrl+C twice // this is for better developer experience, we can remove when the server is stable enough fprintf(stderr, "Received second interrupt, terminating immediately.\n"); exit(1); } shutdown_handler(signal); } // wrapper function that handles exceptions and logs errors // this is to make sure handler_t never throws exceptions; instead, it returns an error response static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { std::string message; try { return func(req); } catch (const std::exception & e) { message = e.what(); } catch (...) { message = "unknown error"; } auto res = std::make_unique(); res->status = 500; try { json error_data = format_error_response(message, ERROR_TYPE_SERVER); res->status = json_value(error_data, "code", 500); res->data = safe_json_to_str({{ "error", error_data }}); LOG_WRN("got exception: %s\n", res->data.c_str()); } catch (const std::exception & e) { LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); res->data = "Internal Server Error"; } return res; }; } int main(int argc, char ** argv) { // own arguments required by this example common_params params; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } // TODO: should we have a separate n_parallel parameter for the server? 
int main(int argc, char ** argv) {
    // own arguments required by this example
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
        return 1;
    }

    // TODO: should we have a separate n_parallel parameter for the server?
    //       https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
    // TODO: this is a common configuration that is suitable for most local use cases
    //       however, overriding the parameters is a bit confusing - figure out something more intuitive
    if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);
        params.n_parallel = 4;
        params.kv_unified = true;
    }

    common_init();

    // struct that contains llama context and inference
    server_context ctx_server;

    llama_backend_init();
    llama_numa_init(params.numa);

    LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
    LOG_INF("\n");
    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    LOG_INF("\n");

    server_http_context ctx_http;
    if (!ctx_http.init(params)) {
        LOG_ERR("%s: failed to initialize HTTP server\n", __func__);
        return 1;
    }

    //
    // Router
    //

    // register API routes
    server_routes routes(params, ctx_server, [&ctx_http]() {
        return ctx_http.is_ready.load();
    });

    ctx_http.get ("/health",              ex_wrapper(routes.get_health));  // public endpoint (no API key check)
    ctx_http.get ("/v1/health",           ex_wrapper(routes.get_health));  // public endpoint (no API key check)
    ctx_http.get ("/metrics",             ex_wrapper(routes.get_metrics));
    ctx_http.get ("/props",               ex_wrapper(routes.get_props));
    ctx_http.post("/props",               ex_wrapper(routes.post_props));
    ctx_http.post("/api/show",            ex_wrapper(routes.get_api_show));
    ctx_http.get ("/models",              ex_wrapper(routes.get_models));  // public endpoint (no API key check)
    ctx_http.get ("/v1/models",           ex_wrapper(routes.get_models));  // public endpoint (no API key check)
    ctx_http.get ("/api/tags",            ex_wrapper(routes.get_models));  // ollama specific endpoint. public endpoint (no API key check)
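
    // example requests against the endpoints below, assuming the default --host/--port (127.0.0.1:8080):
    //   curl http://127.0.0.1:8080/health
    //   curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" \
    //        -d '{"messages": [{"role": "user", "content": "Hello"}]}'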
    ctx_http.post("/completion",          ex_wrapper(routes.post_completions)); // legacy
    ctx_http.post("/completions",         ex_wrapper(routes.post_completions));
    ctx_http.post("/v1/completions",      ex_wrapper(routes.post_completions_oai));
    ctx_http.post("/chat/completions",    ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/api/chat",            ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
    ctx_http.post("/v1/messages",              ex_wrapper(routes.post_anthropic_messages));     // anthropic messages API
    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
    ctx_http.post("/infill",              ex_wrapper(routes.post_infill));
    ctx_http.post("/embedding",           ex_wrapper(routes.post_embeddings)); // legacy
    ctx_http.post("/embeddings",          ex_wrapper(routes.post_embeddings));
    ctx_http.post("/v1/embeddings",       ex_wrapper(routes.post_embeddings_oai));
    ctx_http.post("/rerank",              ex_wrapper(routes.post_rerank));
    ctx_http.post("/reranking",           ex_wrapper(routes.post_rerank));
    ctx_http.post("/v1/rerank",           ex_wrapper(routes.post_rerank));
    ctx_http.post("/v1/reranking",        ex_wrapper(routes.post_rerank));
    ctx_http.post("/tokenize",            ex_wrapper(routes.post_tokenize));
    ctx_http.post("/detokenize",          ex_wrapper(routes.post_detokenize));
    ctx_http.post("/apply-template",      ex_wrapper(routes.post_apply_template));

    // LoRA adapters hotswap
    ctx_http.get ("/lora-adapters",       ex_wrapper(routes.get_lora_adapters));
    ctx_http.post("/lora-adapters",       ex_wrapper(routes.post_lora_adapters));

    // Save & load slots
    ctx_http.get ("/slots",               ex_wrapper(routes.get_slots));
    ctx_http.post("/slots/:id_slot",      ex_wrapper(routes.post_slots));

    //
    // Start the server
    //

    // setup clean up function, to be called before exit
    auto clean_up = [&ctx_http, &ctx_server]() {
        SRV_INF("%s: cleaning up before exit...\n", __func__);
        ctx_http.stop();
        ctx_server.terminate();
        llama_backend_free();
    };

    // start the HTTP server before loading the model to be able to serve /health requests
    if (!ctx_http.start()) {
        clean_up();
        LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
        return 1;
    }

    // load the model
    LOG_INF("%s: loading model\n", __func__);

    if (!ctx_server.load_model(params)) {
        clean_up();
        if (ctx_http.thread.joinable()) {
            ctx_http.thread.join();
        }
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }

    ctx_server.init();
    ctx_http.is_ready.store(true);

    LOG_INF("%s: model loaded\n", __func__);

    shutdown_handler = [&](int) {
        // this will unblock start_loop()
        ctx_server.terminate();
    };
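
    // shutdown flow: the first SIGINT/SIGTERM invokes shutdown_handler, which calls
    // ctx_server.terminate() and unblocks start_loop() below so clean_up() can run;
    // a second Ctrl+C is caught by signal_handler() above and terminates immediately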
    // TODO: refactor in common/console
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT,  &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

    LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
    LOG_INF("%s: starting the main loop...\n", __func__);

    // this call blocks the main thread until ctx_server.terminate() is called
    ctx_server.start_loop();

    clean_up();
    if (ctx_http.thread.joinable()) {
        ctx_http.thread.join();
    }
    llama_memory_breakdown_print(ctx_server.get_llama_context());

    return 0;
}