rm redundant includes
This commit is contained in:
parent
3b7946034c
commit
ca993bad51
|
|
@ -42,11 +42,6 @@ using json = nlohmann::ordered_json;
|
|||
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
|
||||
#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
|
||||
using raw_buffer = std::vector<uint8_t>;
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -71,41 +66,6 @@ enum stop_type {
|
|||
STOP_TYPE_LIMIT,
|
||||
};
|
||||
|
||||
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
|
||||
enum slot_state {
|
||||
SLOT_STATE_IDLE,
|
||||
SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
|
||||
SLOT_STATE_PROCESSING_PROMPT,
|
||||
SLOT_STATE_DONE_PROMPT,
|
||||
SLOT_STATE_GENERATING,
|
||||
};
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
};
|
||||
|
||||
enum server_task_type {
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
SERVER_TASK_TYPE_EMBEDDING,
|
||||
SERVER_TASK_TYPE_RERANK,
|
||||
SERVER_TASK_TYPE_INFILL,
|
||||
SERVER_TASK_TYPE_CANCEL,
|
||||
SERVER_TASK_TYPE_NEXT_RESPONSE,
|
||||
SERVER_TASK_TYPE_METRICS,
|
||||
SERVER_TASK_TYPE_SLOT_SAVE,
|
||||
SERVER_TASK_TYPE_SLOT_RESTORE,
|
||||
SERVER_TASK_TYPE_SLOT_ERASE,
|
||||
SERVER_TASK_TYPE_SET_LORA,
|
||||
};
|
||||
|
||||
enum oaicompat_type {
|
||||
OAICOMPAT_TYPE_NONE,
|
||||
OAICOMPAT_TYPE_CHAT,
|
||||
OAICOMPAT_TYPE_COMPLETION,
|
||||
OAICOMPAT_TYPE_EMBEDDING,
|
||||
};
|
||||
|
||||
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
|
||||
enum error_type {
|
||||
ERROR_TYPE_INVALID_REQUEST,
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
#include "server-common.h"
|
||||
#include "server-task.h"
|
||||
#include "server-queue.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <deque>
|
||||
#include <mutex>
|
||||
#include <unordered_set>
|
||||
#include "log.h"
|
||||
|
||||
#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
|
||||
#define RES_INF(fmt, ...) LOG_INF("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define RES_WRN(fmt, ...) LOG_WRN("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define RES_ERR(fmt, ...) LOG_ERR("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define RES_DBG(fmt, ...) LOG_DBG("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// server_queue
|
||||
|
|
@ -150,7 +155,7 @@ void server_queue::cleanup_pending_task(int id_target) {
|
|||
//
|
||||
|
||||
void server_response::add_waiting_task_id(int id_task) {
|
||||
SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
|
||||
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(id_task);
|
||||
|
|
@ -160,13 +165,13 @@ void server_response::add_waiting_tasks(const std::vector<server_task> & tasks)
|
|||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
|
||||
for (const auto & task : tasks) {
|
||||
SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
|
||||
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
|
||||
waiting_task_ids.insert(task.id);
|
||||
}
|
||||
}
|
||||
|
||||
void server_response::remove_waiting_task_id(int id_task) {
|
||||
SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
|
||||
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(id_task);
|
||||
|
|
@ -182,7 +187,7 @@ void server_response::remove_waiting_task_ids(const std::unordered_set<int> & id
|
|||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
|
||||
for (const auto & id_task : id_tasks) {
|
||||
SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
|
||||
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
|
||||
waiting_task_ids.erase(id_task);
|
||||
}
|
||||
}
|
||||
|
|
@ -192,7 +197,7 @@ server_task_result_ptr server_response::recv(const std::unordered_set<int> & id_
|
|||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
RES_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
return !queue_results.empty();
|
||||
|
|
@ -224,7 +229,7 @@ server_task_result_ptr server_response::recv_with_timeout(const std::unordered_s
|
|||
|
||||
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
RES_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
if (cr_res == std::cv_status::timeout) {
|
||||
|
|
@ -241,12 +246,12 @@ server_task_result_ptr server_response::recv(int id_task) {
|
|||
}
|
||||
|
||||
void server_response::send(server_task_result_ptr && result) {
|
||||
SRV_DBG("sending result for task id = %d\n", result->id);
|
||||
RES_DBG("sending result for task id = %d\n", result->id);
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
for (const auto & id_task : waiting_task_ids) {
|
||||
if (result->id == id_task) {
|
||||
SRV_DBG("task id = %d pushed to result queue\n", result->id);
|
||||
RES_DBG("task id = %d pushed to result queue\n", result->id);
|
||||
|
||||
queue_results.emplace_back(std::move(result));
|
||||
condition_results.notify_all();
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
#pragma once
|
||||
|
||||
#include "server-common.h"
|
||||
#include "server-task.h"
|
||||
|
||||
#include <chrono>
|
||||
|
|
|
|||
|
|
@ -1,16 +1,12 @@
|
|||
#include "server-common.h"
|
||||
#include "server-task.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "chat.h"
|
||||
#include "sampling.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <list>
|
||||
|
||||
#include "server-common.h"
|
||||
#include "server-task.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -10,10 +10,33 @@
|
|||
#include <unordered_set>
|
||||
#include <list>
|
||||
|
||||
// TODO: prevent including the whole server-common.h as we only use server_tokens
|
||||
#include "server-common.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum server_task_type {
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
SERVER_TASK_TYPE_EMBEDDING,
|
||||
SERVER_TASK_TYPE_RERANK,
|
||||
SERVER_TASK_TYPE_INFILL,
|
||||
SERVER_TASK_TYPE_CANCEL,
|
||||
SERVER_TASK_TYPE_NEXT_RESPONSE,
|
||||
SERVER_TASK_TYPE_METRICS,
|
||||
SERVER_TASK_TYPE_SLOT_SAVE,
|
||||
SERVER_TASK_TYPE_SLOT_RESTORE,
|
||||
SERVER_TASK_TYPE_SLOT_ERASE,
|
||||
SERVER_TASK_TYPE_SET_LORA,
|
||||
};
|
||||
|
||||
// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
|
||||
enum oaicompat_type {
|
||||
OAICOMPAT_TYPE_NONE,
|
||||
OAICOMPAT_TYPE_CHAT,
|
||||
OAICOMPAT_TYPE_COMPLETION,
|
||||
OAICOMPAT_TYPE_EMBEDDING,
|
||||
};
|
||||
|
||||
struct task_params {
|
||||
bool stream = true;
|
||||
bool include_usage = false;
|
||||
|
|
|
|||
|
|
@ -23,6 +23,20 @@ using json = nlohmann::ordered_json;
|
|||
|
||||
constexpr int HTTP_POLLING_SECONDS = 1;
|
||||
|
||||
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
|
||||
enum slot_state {
|
||||
SLOT_STATE_IDLE,
|
||||
SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
|
||||
SLOT_STATE_PROCESSING_PROMPT,
|
||||
SLOT_STATE_DONE_PROMPT,
|
||||
SLOT_STATE_GENERATING,
|
||||
};
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
};
|
||||
|
||||
static bool server_task_type_need_embd(server_task_type task_type) {
|
||||
switch (task_type) {
|
||||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
|
|
|
|||
Loading…
Reference in New Issue