diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 3f74b51bc6..d1b02ae71c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1005,7 +1005,7 @@ void llama_context::set_warmup(bool value) {
 }
 
 bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
-    LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
+    LLAMA_LOG_ERROR("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
 
     const bool can_offload =
         sampler &&
diff --git a/src/llama-context.h b/src/llama-context.h
index d085d25779..960e4a0782 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -40,7 +40,15 @@ struct llama_context {
     ~llama_context();
 
+    // reserve a new backend scheduler
+    // recommended to call whenever the context changes in such a way that the compute graph is modified.
+    // for example:
+    //   - changing loras
+    //   - changing samplers
+    //   - changing attention type
+    //   - etc.
     void reserve();
+
     void synchronize();
 
     const llama_model & get_model() const;
 
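
For context, here is a minimal, self-contained sketch of the usage pattern the new header comment describes: any setter that changes the compute graph (loras, samplers, attention type, ...) is followed by a `reserve()` call that rebuilds the backend scheduler. The `context_sketch` struct and its members are hypothetical stand-ins for illustration only; they are not part of llama.cpp or of this patch.

```cpp
#include <cstdio>

// Hypothetical model of the reserve() pattern; not the real llama_context.
struct context_sketch {
    int  n_loras           = 0;     // stand-in for graph-affecting state
    bool sampler_offloaded = false; // stand-in for graph-affecting state

    // stand-in for llama_context::reserve(): rebuild the backend scheduler
    // so it matches the current shape of the compute graph
    void reserve() {
        std::printf("rebuilding backend scheduler (loras=%d, offload=%d)\n",
                    n_loras, (int) sampler_offloaded);
    }

    // setters that modify the compute graph call reserve() afterwards
    void set_lora_count(int n) {
        n_loras = n;
        reserve();
    }

    void set_sampler_offload(bool v) {
        sampler_offloaded = v;
        reserve();
    }
};

int main() {
    context_sketch ctx;
    ctx.set_lora_count(2);          // graph changed -> rebuild scheduler
    ctx.set_sampler_offload(true);  // graph changed -> rebuild scheduler
}
```

The sketch only illustrates the discipline stated in the comment: when the graph changes, rebuild via `reserve()` instead of continuing with a scheduler reserved for the previous graph.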