diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index f82a6cce56..ceafcac179 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include
 #include
 
 // fix problem with std::min and std::max
@@ -2748,15 +2747,7 @@ private:
 
                 slot.i_batch = -1;
 
-                try {
-                    common_sampler_accept(slot.smpl.get(), id, true);
-                } catch (std::runtime_error & e) {
-                    SLT_ERR(slot, "Error when accepting token for sampler: %s\n", e.what());
-                    send_error(slot, std::string("Error when accepting token for sampler: ") + e.what(), ERROR_TYPE_SERVER);
-                    slot.release();
-                    slot.i_batch = -1;
-                    continue; // continue loop of slots
-                }
+                common_sampler_accept(slot.smpl.get(), id, true);
 
                 // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
                 const int64_t t_current = ggml_time_us();
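
For context, the removed code followed a common pattern: catch a per-item exception inside a processing loop, report the error, release that item's resources, and continue with the remaining items. After this change, an exception thrown by common_sampler_accept (if any) would instead propagate out of the slot loop to the caller. Below is a minimal, self-contained sketch of the two patterns; the names Slot, accept_token, process_slots_catching and process_slots_propagating are hypothetical stand-ins, not the server's actual types or functions.

#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for the server's per-request slot and its sampler state.
struct Slot {
    int  id      = 0;
    int  i_batch = -1;
    bool active  = true;

    void release() { active = false; } // free per-slot resources
};

// Hypothetical stand-in for common_sampler_accept(); may throw on failure.
void accept_token(Slot & slot, int token) {
    if (token < 0) {
        throw std::runtime_error("invalid token for slot " + std::to_string(slot.id));
    }
    // ... update the sampler state with the accepted token ...
}

// Pattern of the removed code: handle the error per slot and keep iterating.
void process_slots_catching(std::vector<Slot> & slots, int token) {
    for (auto & slot : slots) {
        try {
            accept_token(slot, token);
        } catch (const std::runtime_error & e) {
            std::fprintf(stderr, "slot %d: %s\n", slot.id, e.what());
            slot.release();
            slot.i_batch = -1;
            continue; // move on to the remaining slots
        }
    }
}

// Pattern after the change: an exception propagates out of the loop to the caller.
void process_slots_propagating(std::vector<Slot> & slots, int token) {
    for (auto & slot : slots) {
        accept_token(slot, token); // a throw here abandons the rest of the loop
    }
}

int main() {
    std::vector<Slot> slots(2);
    slots[0].id = 1;
    slots[1].id = 2;

    process_slots_catching(slots, /*token=*/-1); // logs the error and releases each slot
    // process_slots_propagating(slots, -1);     // would throw std::runtime_error instead
    return 0;
}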