parent bac8bed248
commit 76681e3c73
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--graph-reuse", "-gr"},
string_format("reuse previous compute graphs when possible (default: %s)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"),
[](common_params & params) {
params.graph_reuse = true;
}
).set_env("LLAMA_ARG_GRAPH_REUSE"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
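The hunk above adds the new toggle to the common argument parser: graph reuse stays off by default (graph_reuse = false in common_params, see below) and can be switched on with --graph-reuse / -gr or, per the set_env call, with the LLAMA_ARG_GRAPH_REUSE environment variable.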
@@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.graph_reuse = params.graph_reuse;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
@@ -330,6 +330,7 @@ struct common_params {
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool graph_reuse = false; // reuse previous compute graphs when possible

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
@@ -374,6 +374,8 @@ extern "C" {
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573

bool graph_reuse; // reuse previous compute graphs when possible
};

// model quantization parameters
@@ -1429,6 +1431,7 @@ extern "C" {

int32_t n_p_eval;
int32_t n_eval;
int32_t n_reused;
};

struct llama_perf_sampler_data {
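For context, a minimal sketch of how the new public knobs could be exercised through the C API once this commit lands; llama_init_from_model, llama_context_default_params and llama_perf_context already exist upstream, while graph_reuse and n_reused are the fields introduced here:

    #include "llama.h"
    #include <cstdio>

    // Create a context with compute-graph reuse enabled (new field in llama_context_params).
    static llama_context * ctx_with_graph_reuse(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.graph_reuse = true;
        return llama_init_from_model(model, cparams);
    }

    // After some decode calls, report how often the previous graph was reused
    // (new n_reused field in llama_perf_context_data).
    static void report_reuse(const llama_context * ctx) {
        const llama_perf_context_data data = llama_perf_context(ctx);
        std::printf("graphs reused: %d of %d eval calls\n", data.n_reused, data.n_eval);
    }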
@@ -34,6 +34,31 @@ struct llama_ubatch {
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
int8_t * output; // [n_tokens] | i | -

bool is_same(const llama_ubatch & other) const {
bool res =
equal_seqs == other.equal_seqs &&
n_tokens == other.n_tokens &&
n_seq_tokens == other.n_seq_tokens &&
n_seqs == other.n_seqs &&
n_seqs_unq == other.n_seqs_unq &&
(
(!token && !other.token) ||
(!embd && !other.embd)
);

if (!res) {
return false;
}

// TODO: this won't work because seq_id_unq ptr can point to an old balloc that has
// been freed by this point. find a way to fix this
//for (uint32_t s = 0; s < n_seqs_unq; ++s) {
// res &= seq_id_unq[s] == other.seq_id_unq[s];
//}

return res;
}
};

// a helper for sanitizing, fulfilling and splitting a batch
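llama_ubatch::is_same() compares only the fields that determine the shape of the batch (token and sequence counts and, roughly, which of token/embd is populated); it deliberately does not dereference the data pointers, which may point into a batch allocator that has since been rebuilt. A reduced stand-alone illustration of that idea (hypothetical names, not the llama.cpp types):

    #include <cstdint>

    struct batch_shape {
        uint32_t n_tokens   = 0;
        uint32_t n_seqs     = 0;
        bool     has_tokens = false; // text tokens present
        bool     has_embd   = false; // raw embeddings present

        // Equal shapes are enough to reuse a graph; the actual token/embedding
        // values are fed in later through set_inputs().
        bool is_same(const batch_shape & other) const {
            return n_tokens   == other.n_tokens   &&
                   n_seqs     == other.n_seqs     &&
                   has_tokens == other.has_tokens &&
                   has_embd   == other.has_embd;
        }
    };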
@@ -101,7 +101,8 @@ llama_context::llama_context(
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

cparams.op_offload = params.op_offload;
cparams.op_offload = params.op_offload;
cparams.graph_reuse = params.graph_reuse;

const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -227,8 +228,8 @@ llama_context::llama_context(
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

// buffer used to store the computation graph and the tensor meta data
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
gf_res_prev.reset(new llm_graph_result(max_nodes));
gf_res_reserve.reset(new llm_graph_result(max_nodes));

// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
return sched.get();
}

ggml_context * llama_context::get_ctx_compute() const {
return ctx_compute.get();
}

uint32_t llama_context::n_ctx() const {
return cparams.n_ctx;
}
@@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec(
return cvec.apply(model, data, len, n_embd, il_start, il_end);
}

llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
if (mctx && !mctx->apply()) {
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}

auto * gf = graph_init();
if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
auto * res = gf_res_prev.get();
auto * gf = res->get_gf();

auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
// the new graph parameters
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);

// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
const bool can_reuse = cparams.graph_reuse && res->update(gparams);
if (can_reuse) {
LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
} else {
res->reset();

if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

//const auto t_start_us = ggml_time_us();

gf = model.build_graph(gparams);

//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);

if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}

if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
}
}

res->set_inputs(&ubatch);

const auto status = graph_compute(gf, ubatch.n_tokens > 1);
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
if (status != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
ret = status;
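The hunk above is the core of the change: process_ubatch now derives llm_graph_params for the new ubatch and asks the previously built result whether it can be updated in place; only when that fails does it reset, rebuild and re-allocate. A self-contained toy version of that control flow (illustrative types only, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical parameter set that fully determines the graph topology.
    struct graph_params {
        uint32_t n_tokens  = 0;
        uint32_t n_outputs = 0;
        bool operator==(const graph_params & o) const {
            return n_tokens == o.n_tokens && n_outputs == o.n_outputs;
        }
    };

    struct graph_result {
        graph_params params;
        bool built = false;

        // Analogous to res->update(gparams): reusable only if the stored
        // parameters describe the same topology as the new ones.
        bool update(const graph_params & p) { return built && params == p; }

        void rebuild(const graph_params & p) { params = p; built = true; /* build + alloc here */ }
    };

    int main() {
        graph_result res;
        int n_reused = 0;
        const bool graph_reuse = true;

        const graph_params steps[] = {{512, 1}, {1, 1}, {1, 1}, {1, 1}}; // prompt, then single-token decodes

        for (const auto & p : steps) {
            const bool can_reuse = graph_reuse && res.update(p);
            if (can_reuse) {
                n_reused++;       // same topology: only the input data is refreshed
            } else {
                res.rebuild(p);   // topology changed: build and allocate a new graph
            }
        }
        std::printf("graphs reused: %d\n", n_reused); // prints 2 for the steps above
    }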
@@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) {

n_outputs = n_tokens;

ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

const auto causal_attn_org = cparams.causal_attn;

// always use non-causal attention for encoder graphs
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
cparams.causal_attn = false;

ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);

cparams.causal_attn = causal_attn_org;

@@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) {

// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}

// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
n_outputs = n_outputs_new;
}

ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) {

// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}

return 0;
}
@@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// graph
//

int32_t llama_context::graph_max_nodes() const {
return std::max<int32_t>(65536, 5*model.n_tensors());
}

ggml_cgraph * llama_context::graph_init() {
ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};

ctx_compute.reset(ggml_init(params));

return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
uint32_t llama_context::graph_max_nodes() const {
return std::max<uint32_t>(65536u, 5u*model.n_tensors());
}

ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}

gf_res_prev->reset();
ggml_backend_sched_reset(sched.get());

// store the n_outputs as it is, and restore it afterwards
// TODO: not sure if needed, might simplify in the future by removing this
const auto save_n_outputs = this->n_outputs;
@@ -1310,18 +1310,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
auto * res = gf_res_reserve.get();

const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);

res->reset();

auto * gf = model.build_graph(gparams);

this->n_outputs = save_n_outputs;

if (!res) {
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
return nullptr;
}

ggml_backend_sched_reset(sched.get());

// initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
@@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
return gf;
}

llm_graph_result_ptr llama_context::graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx) {
return model.build_graph(
{
/*.ctx =*/ ctx,
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
}, gf, gtype);
llm_graph_params llama_context::graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.gtype =*/ gtype,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
};
}

ggml_status llama_context::graph_compute(
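Note that with graph_reuse enabled the trailing ggml_backend_sched_reset() in encode()/decode() is skipped, and the per-ubatch reset plus eval-callback setup moves into the rebuild branch of process_ubatch, presumably so that the scheduler keeps the previous allocation alive for the next, potentially identical, graph.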
@@ -1930,6 +1927,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
data.t_eval_ms = 1e-3 * t_eval_us;
data.n_p_eval = std::max(1, n_p_eval);
data.n_eval = std::max(1, n_eval);
data.n_reused = std::max(0, n_reused);

return data;
}
@@ -1938,6 +1936,7 @@ void llama_context::perf_reset() {
t_start_us = ggml_time_us();
t_eval_us = n_eval = 0;
t_p_eval_us = n_p_eval = 0;
n_reused = 0;
}

//
@@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter(
break;
}

auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
auto * res = gf_res_prev.get();

const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);

res->reset();

auto * gf = model.build_graph(gparams);

struct ggml_context * ctx_compute_opt;
{
@@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() {
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.graph_reuse =*/ false,
};

return result;
@@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
}

void llama_perf_context_reset(llama_context * ctx) {
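Besides the counter itself, llama_perf_context_print now emits a "graphs reused = N" line, so any tool that already prints context perf data will show how often reuse actually kicked in.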
@@ -35,8 +35,6 @@ struct llama_context {

ggml_backend_sched_t get_sched() const;

ggml_context * get_ctx_compute() const;

uint32_t n_ctx() const;
uint32_t n_ctx_per_seq() const;
uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
// if memory_context is provided, it will be applied first to the context's memory
// ret contains the status of the graph computation
// returns nullptr only if ret != GGML_STATUS_SUCCESS
llm_graph_result_ptr process_ubatch(
llm_graph_result_i * process_ubatch(
const llama_ubatch & ubatch,
llm_graph_type gtype,
llama_memory_context_i * mctx,
@@ -188,10 +186,7 @@ private:
//

public:
int32_t graph_max_nodes() const;

// zero-out inputs and create the ctx_compute for the compute graph
ggml_cgraph * graph_init();
uint32_t graph_max_nodes() const;

// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +195,11 @@ public:
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);

private:
llm_graph_result_ptr graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx);
llm_graph_params graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const;

llm_graph_cb graph_get_cb() const;

@@ -258,8 +252,6 @@ private:
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;

ggml_context_ptr ctx_compute;

// training
ggml_opt_context_t opt_ctx = nullptr;

@@ -275,8 +267,8 @@ private:
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;

// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
llm_graph_result_ptr gf_res_prev;
llm_graph_result_ptr gf_res_reserve;

// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
@@ -294,4 +286,6 @@ private:

mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls

mutable int32_t n_reused = 0; // number of times the previous graph was reused
};
@@ -3,3 +3,32 @@
size_t llama_max_parallel_sequences(void) {
return LLAMA_MAX_SEQ;
}

bool llama_cparams::is_same(const llama_cparams & other) const {
return
n_ctx == other.n_ctx &&
n_batch == other.n_batch &&
n_ubatch == other.n_ubatch &&
n_seq_max == other.n_seq_max &&
n_threads == other.n_threads &&
n_threads_batch == other.n_threads_batch &&
rope_freq_base == other.rope_freq_base &&
rope_freq_scale == other.rope_freq_scale &&
n_ctx_orig_yarn == other.n_ctx_orig_yarn &&
yarn_ext_factor == other.yarn_ext_factor &&
yarn_attn_factor == other.yarn_attn_factor &&
yarn_beta_fast == other.yarn_beta_fast &&
yarn_beta_slow == other.yarn_beta_slow &&
defrag_thold == other.defrag_thold &&
embeddings == other.embeddings &&
causal_attn == other.causal_attn &&
offload_kqv == other.offload_kqv &&
flash_attn == other.flash_attn &&
no_perf == other.no_perf &&
warmup == other.warmup &&
op_offload == other.op_offload &&
graph_reuse == other.graph_reuse &&
pooling_type == other.pooling_type &&
cb_eval == other.cb_eval &&
cb_eval_user_data == other.cb_eval_user_data;
}
@@ -33,9 +33,12 @@ struct llama_cparams {
bool no_perf;
bool warmup;
bool op_offload;
bool graph_reuse;

enum llama_pooling_type pooling_type;

ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;

bool is_same(const llama_cparams & other) const;
};
@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_embd::update(const llm_graph_params & params) {
bool res = true;

res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);

return res;
}

void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_pos::update(const llm_graph_params & params) {
bool res = true;

res &= pos->ne[0] == params.ubatch.n_tokens;

return res;
}

void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_out_ids::update(const llm_graph_params & params) {
bool res = true;

res &= n_outputs == params.n_outputs;

return res;
}

void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;

res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_supports_set_rows(); // TODO: tmp

return res;
}

void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;

res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens;

res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp

return res;
}

void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(cross_kq_mask);

@@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_ctx_orig (cparams.n_ctx_orig_yarn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
ctx0 (params.ctx),
sched (params.sched),
backend_cpu (params.backend_cpu),
cvec (params.cvec),
@@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
mctx (params.mctx),
cross (params.cross),
cb_func (params.cb),
res (std::make_unique<llm_graph_result>()) {
res (static_cast<llm_graph_result *>(params.res)),
ctx0 (res->get_ctx()) {
}

void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1,6 +1,7 @@
#pragma once

#include "llama-arch.h"
#include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"

@@ -14,7 +15,6 @@ struct ggml_cgraph;
struct ggml_context;
struct ggml_tensor;

struct llama_ubatch;
struct llama_cparams;

struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
std::vector<std::set<llama_seq_id>> seq_ids_enc;
};

struct llm_graph_params;

//
// llm_graph_input
//
@@ -78,11 +80,19 @@ public:
virtual ~llm_graph_input_i() = default;

virtual void set_input(const llama_ubatch * ubatch) = 0;

// return true if the resulting input tensors using the provided graph parameters would be
// the same as the previous input tensors that we have currently stored in the object
virtual bool update(const llm_graph_params & params) {
// returning false here by default will prevent from reusing the graph if the check
// for the input type has not been implemented yet
GGML_UNUSED(params);
return false;
}
};

using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;


class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
};
@@ -101,6 +113,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * pos = nullptr; // I32 [n_batch]

const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
llm_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llm_graph_input_out_ids() = default;

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * out_ids; // I32 [n_outputs]

const llama_hparams & hparams;
const llama_cparams & cparams;

const int32_t n_outputs;
const uint32_t n_outputs;
};

class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,6 +265,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }

@@ -280,6 +298,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -373,29 +393,110 @@ public:
// along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
// these are used by the llama_context to extact the relevant data, based on the compute parameters

// TODO: this interface seems redundant - remove it
class llm_graph_result_i {
public:
virtual ~llm_graph_result_i() = default;

virtual ggml_tensor * get_tokens() = 0;
virtual ggml_tensor * get_logits() = 0;
virtual ggml_tensor * get_embd() = 0;
virtual ggml_tensor * get_embd_pooled() = 0;
virtual ggml_tensor * get_tokens() const = 0;
virtual ggml_tensor * get_logits() const = 0;
virtual ggml_tensor * get_embd() const = 0;
virtual ggml_tensor * get_embd_pooled() const = 0;

virtual ggml_cgraph * get_gf() = 0;
virtual ggml_context * get_ctx() = 0;

virtual void reset() = 0;

virtual void set_inputs(const llama_ubatch * ubatch) = 0;

virtual bool update(const llm_graph_params & params) = 0;
};

using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;

// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

struct llm_graph_params {
llm_arch arch = LLM_ARCH_UNKNOWN;

llama_hparams hparams;
llama_cparams cparams;

llama_ubatch ubatch; // note: intentionally make a copy

llm_graph_type gtype;

ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;

uint32_t n_outputs;

llm_graph_cb cb;

// TODO: temporary
llm_graph_result_i * res;

bool is_same(const llm_graph_params & other) const {
return
hparams.is_same(other.hparams) &&
cparams.is_same(other.cparams) &&
ubatch .is_same(other.ubatch) &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
};

class llm_graph_result : public llm_graph_result_i {
public:
llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
reset();
}

virtual ~llm_graph_result() = default;

ggml_tensor * get_tokens() override { return t_tokens; }
ggml_tensor * get_logits() override { return t_logits; }
ggml_tensor * get_embd() override { return t_embd; }
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
ggml_tensor * get_tokens() const override { return t_tokens; }
ggml_tensor * get_logits() const override { return t_logits; }
ggml_tensor * get_embd() const override { return t_embd; }
ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }

ggml_cgraph * get_gf() override { return gf; }
ggml_context * get_ctx() override { return ctx_compute.get(); }

void set_max_nodes(int64_t max_nodes) {
this->max_nodes = max_nodes;
}

void reset() override {
t_tokens = nullptr;
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;

inputs.clear();

buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};

ctx_compute.reset(ggml_init(params));

gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
}

void set_inputs(const llama_ubatch * ubatch) override {
for (auto & input : inputs) {
@@ -403,6 +504,25 @@ public:
}
}

// try to update the existing graph result using the new graph parameters
// this can only be done if we determine that the resulting graph using the new graph parameters
// would be identical to the existing graph. in that case, we simply have to update the memory
// contexts of the input tensors of the graph and we can reuse it for another computation
// return true if the graph was updated and can be reused
bool update(const llm_graph_params & params) override {
if (!this->params.is_same(params)) {
return false;
}

bool res = true;

for (auto & input : inputs) {
res &= input->update(params);
}

return res;
}

llm_graph_input_i * add_input(llm_graph_input_ptr input) {
inputs.emplace_back(std::move(input));
return inputs.back().get();
@@ -415,37 +535,26 @@ public:
ggml_tensor * t_embd_pooled = nullptr;

std::vector<llm_graph_input_ptr> inputs;

ggml_context_ptr ctx_compute;

// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;

ggml_cgraph * gf;

int64_t max_nodes;

// keep a copy of the previous graph parameters
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
// note: these are updated after constructing the new graph
llm_graph_params params;
};

//
// llm_graph_context
//

// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

struct llm_graph_params {
ggml_context * ctx;

const llm_arch arch;

const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_ubatch & ubatch;

ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;

uint32_t n_outputs;

const llm_graph_cb & cb;
};

// used in build_rs to properly order writes and avoid unnecessary copies
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@@ -485,8 +594,6 @@ struct llm_graph_context {
const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type;

ggml_context * ctx0 = nullptr;

ggml_backend_sched_t sched;

ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -498,7 +605,9 @@ struct llm_graph_context {

const llm_graph_cb & cb_func;

std::unique_ptr<llm_graph_result> res;
llm_graph_result * res;

ggml_context * ctx0 = nullptr;

llm_graph_context(const llm_graph_params & params);
virtual ~llm_graph_context() = default;
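The reuse decision is delegated to the inputs: llm_graph_result::update() first checks that the stored llm_graph_params are equivalent, then asks every registered input whether its tensors would come out the same, and the base-class update() returning false makes unported input types veto reuse. A stripped-down stand-alone sketch of that pattern (hypothetical names, not the llama.cpp classes):

    #include <memory>
    #include <vector>

    struct params_t { int n_tokens; };

    struct input_i {
        virtual ~input_i() = default;
        virtual bool update(const params_t &) { return false; } // conservative default: no reuse
    };

    struct input_pos : input_i {
        int n_tokens = 0;
        bool update(const params_t & p) override { return n_tokens == p.n_tokens; }
    };

    struct result_t {
        std::vector<std::unique_ptr<input_i>> inputs;

        bool update(const params_t & p) {
            bool ok = true;
            for (auto & inp : inputs) {
                ok &= inp->update(p); // a single false disables reuse of the whole graph
            }
            return ok;
        }
    };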
@@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const {

GGML_ABORT("fatal error");
}

bool llama_hparams::is_same(const llama_hparams & other) const {
return
n_ctx_train == other.n_ctx_train &&
n_embd == other.n_embd &&
n_layer == other.n_layer &&
n_expert == other.n_expert &&
n_expert_used == other.n_expert_used;
}
@@ -202,6 +202,8 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;

bool is_swa(uint32_t il) const;

bool is_same(const llama_hparams & other) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(

cells.resize(kv_size);

gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later

for (uint32_t il = 0; il < n_layer_cache; il++) {
if (filter && !filter(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
@@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;

const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;

if (!supports_set_rows) {
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
@@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
ggml_backend_sched_reset(sched);

auto * gf = lctx->graph_init();
auto * res = gf_res.get();

auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();

auto * gf = build_graph_shift(res, lctx);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
return updated;
@@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d

ggml_backend_sched_reset(sched);

auto * gf = lctx->graph_init();
auto * res = gf_res.get();

auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();

auto * gf = build_graph_defrag(res, lctx, dinfo);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
return updated;
@@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
}

bool llama_kv_cache_unified::get_supports_set_rows() const {
return supports_set_rows;
}

ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
const int32_t ikv = map_layer_ids.at(il);

@@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
}
}

llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();

const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
@@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
ggml_set_input(inp->k_shift);

const auto & cparams = lctx->get_cparams();

for (const auto & layer : layers) {
const uint32_t il = layer.il;

@@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(

res->add_input(std::move(inp));

return res;
return gf;
}

llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
const defrag_info & dinfo) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();

const auto & ids = dinfo.ids;

const auto & cparams = lctx->get_cparams();

#if 0
// CPU defrag
//
@@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif

return res;
return gf;
}

llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
@@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
return n_kv;
}

bool llama_kv_cache_unified_context::get_supports_set_rows() const {
return kv->get_supports_set_rows();
}

ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv);
}
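With llama_context::graph_init() gone, the unified KV cache now owns a persistent llm_graph_result (gf_res) and builds its K-shift and defrag graphs into it, pulling cparams from the passed-in llama_context instead of taking them as arguments; the new get_supports_set_rows() accessors appear to exist so the attention inputs can refuse graph reuse while the temporary LLAMA_SET_ROWS fallback path is active.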
@@ -121,6 +121,9 @@ public:

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
@@ -193,13 +196,15 @@ private:

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
int supports_set_rows = false;
bool supports_set_rows = false;

const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;

std::unique_ptr<llm_graph_result> gf_res;

llama_kv_cells_unified cells;

std::vector<kv_layer> layers;
@@ -226,15 +231,13 @@ private:
float freq_base,
float freq_scale) const;

llm_graph_result_ptr build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
llama_context * lctx) const;

llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;

void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
@@ -288,6 +291,9 @@ public:

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
@@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
return res;
}

llm_graph_result_ptr llama_model::build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const {
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
auto * gf = params.res->get_gf();

std::unique_ptr<llm_graph_context> llm;

switch (arch) {
@@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph(
} break;
case LLM_ARCH_T5:
{
switch (type) {
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
break;
@@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph(
// add on pooling layer
llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);

return std::move(llm->res);
// TODO: updating the graph parameters here is a little bit obscure - figure out something better
llm->res->params = params;

return llm->res->get_gf();
}

//
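llama_model::build_graph() now takes only the llm_graph_params, returns the raw ggml_cgraph and leaves the result object, including a copy of the parameters that were used to build it, inside params.res; those stored parameters are what later update() calls compare against when deciding whether the graph can be reused.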
@@ -436,10 +436,7 @@ struct llama_model {
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

// TODO: move this to new llm_arch_model_i interface
llm_graph_result_ptr build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const;
ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
struct impl;