server: join the child-server stdin-monitor thread instead of detaching it

setup_child_server() previously spawned a detached thread to watch for EOF on
stdin; main() had no way to wait for it at shutdown. Change it to return the
std::thread so the caller (main in server.cpp) can join it after the HTTP
thread, and update the declaration in server-models.h accordingly.

NOTE(review): hunk context indentation below is reconstructed to the file's
4-space convention; verify against the target tree before applying.

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 3a5bd2b215..ae2b78efba 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -570,7 +570,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
     return proxy;
 }
 
-void server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) {
+std::thread server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) {
     // send a notification to the router server that a model instance is ready
     // TODO @ngxson : use HTTP client from libcommon
     httplib::Client cli(base_params.hostname, router_port);
@@ -598,7 +598,7 @@ void server_models::setup_child_server(const common_params & base_params, int ro
     }
 
     // setup thread for monitoring stdin
-    std::thread([shutdown_handler]() {
+    return std::thread([shutdown_handler]() {
         // wait for EOF on stdin
         SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
         bool eof = false;
@@ -619,7 +619,7 @@ void server_models::setup_child_server(const common_params & base_params, int ro
             SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n");
             exit(1);
         }
-    }).detach();
+    });
 }
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 552e750f58..029ba2fad8 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -122,7 +122,8 @@ public:
     server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
 
     // notify the router server that a model instance is ready
-    static void setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler);
+    // return the monitoring thread (to be joined by the caller)
+    static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler);
 };
 
 struct server_models_routes {
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index d78c21a875..950537d82d 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -276,8 +276,9 @@ int main(int argc, char ** argv, char ** envp) {
 
     // optionally, notify router server that this instance is ready
     const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
+    std::thread monitor_thread;
     if (router_port != nullptr) {
-        server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler);
+        monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler);
     }
 
     // this call blocks the main thread until queue_tasks.terminate() is called
@@ -287,6 +288,9 @@ int main(int argc, char ** argv, char ** envp) {
     if (ctx_http.thread.joinable()) {
         ctx_http.thread.join();
     }
+    if (monitor_thread.joinable()) {
+        monitor_thread.join();
+    }
 
     llama_memory_breakdown_print(ctx_server.get_llama_context());
 }