Merge de30196b51 into 9e2e2198b0
This commit is contained in:
commit
97f0dd7bce
|
|
@ -92,15 +92,14 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
|||
|
||||
// prepare the recurrent batches first
|
||||
if (!mem_recr->prepare(ubatches)) {
|
||||
// TODO: will the recurrent cache be in an undefined context at this point?
|
||||
LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: failed to prepare recurrent ubatches\n", __func__);
|
||||
return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
// prepare the attention cache
|
||||
auto heads_attn = mem_attn->prepare(ubatches);
|
||||
if (heads_attn.empty()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: failed to prepare attention ubatches\n", __func__);
|
||||
return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2681,6 +2681,8 @@ private:
|
|||
}
|
||||
|
||||
int32_t i_next = 0;
|
||||
int32_t n_retries = 0;
|
||||
const int32_t max_retries = 10;
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
|
||||
|
|
@ -2740,14 +2742,20 @@ private:
|
|||
}
|
||||
|
||||
// retry with half the batch size to try to find a free slot in the KV cache
|
||||
if (!try_clear_idle_slots()) {
|
||||
n_batch /= 2;
|
||||
bool cleared_slot = try_clear_idle_slots();
|
||||
n_retries++;
|
||||
if (n_retries >= max_retries || (!cleared_slot && n_batch > 1)) {
|
||||
n_batch = std::max(1, n_batch / 2);
|
||||
n_retries = 0;
|
||||
SRV_WRN("reducing batch size to %d after %d retries (cleared_slot = %d)\n", n_batch, n_retries, cleared_slot);
|
||||
}
|
||||
|
||||
SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
|
||||
SRV_WRN("failed to find free space in the KV cache, retrying (attempt %d/%d), i = %d, n_batch = %d, ret = %d\n",
|
||||
n_retries, max_retries, i, n_batch, ret);
|
||||
|
||||
continue; // continue loop of n_batch
|
||||
}
|
||||
n_retries = 0;
|
||||
|
||||
// move the head of the batch forward with the number of tokens we just processed
|
||||
i_next = i + n_tokens;
|
||||
|
|
|
|||
Loading…
Reference in New Issue