Compare commits

...

4 Commits

Author  SHA1  Message  Date
Aadeshveer Singh  58062860af  ggml : use WARP_SIZE/2 for argmax reduction offset (#18092)  2025-12-17 11:47:01 +08:00
Yuri Khrustalev  2973a65ecb  gguf-py : allow converting multi-tensor models from read-only locations (#18100)  2025-12-17 02:27:03 +01:00
Johannes Gäßler  d0794e89d9  llama-fit-params: force disable mlock (#18103)  2025-12-17 00:50:12 +01:00
Johannes Gäßler  9dcac6cf9f  llama-fit-params: lower ctx size for multi GPU (#18101)  2025-12-17 00:49:34 +01:00
3 changed files with 35 additions and 18 deletions

View File

@@ -21,7 +21,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
     }
 
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
+    for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
         const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
         const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
         if (val > maxval) {
@@ -50,7 +50,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
                 argmax = shared_argmax[lane_id];
             }
 #pragma unroll
-            for (int offset = 16; offset > 0; offset >>= 1) {
+            for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
                 const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
                 const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
                 if (val > maxval) {
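
Both hunks change the warp-level argmax reduction to start its butterfly at WARP_SIZE/2 instead of a hard-coded 16, so the starting offset tracks the warp width rather than assuming a 32-lane warp (64-wide wavefronts exist on some AMD GPUs, for instance). A minimal sketch, not taken from ggml, that simulates the XOR-shuffle pattern shown above in Python and illustrates why the offset has to scale with the warp size:

    # Hedged sketch: emulate the __shfl_xor_sync butterfly used above.
    # Starting at warp_size // 2, every lane ends up holding the global
    # argmax; a fixed start of 16 only reduces 32-lane groups.
    import random

    def warp_argmax(vals: list[float], warp_size: int) -> list[int]:
        maxval = list(vals)
        argmax = list(range(warp_size))
        offset = warp_size // 2                      # the fix: scale with warp size
        while offset > 0:
            # lane i reads the value held by lane i ^ offset (XOR shuffle)
            val = [maxval[i ^ offset] for i in range(warp_size)]
            col = [argmax[i ^ offset] for i in range(warp_size)]
            for i in range(warp_size):
                if val[i] > maxval[i]:
                    maxval[i] = val[i]
                    argmax[i] = col[i]
            offset >>= 1
        return argmax

    vals = [random.random() for _ in range(64)]
    best = max(range(64), key=lambda i: vals[i])
    assert set(warp_argmax(vals, 64)) == {best}      # all 64 lanes agree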

View File

@@ -288,7 +288,7 @@ class LocalTensor:
     data_range: LocalTensorRange
 
     def mmap_bytes(self) -> np.ndarray:
-        return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size)
+        return np.memmap(self.data_range.filename, mode='r', offset=self.data_range.offset, shape=self.data_range.size)
 
 
 class SafetensorsLocal:
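
The only functional change here is the explicit mode='r': np.memmap defaults to mode='r+', which opens the backing file for writing and therefore fails when the model files sit in a read-only location, the situation #18100 targets. A small sketch, independent of gguf-py, that reproduces the failure and the fix; the file name and the chmod are only there to simulate a read-only location:

    # Hedged sketch: np.memmap defaults to mode='r+' (read-write), which
    # raises PermissionError on files we are not allowed to write to.
    import os, stat
    import numpy as np

    path = "tensor.bin"                              # hypothetical data file
    np.arange(16, dtype=np.uint8).tofile(path)
    os.chmod(path, stat.S_IREAD)                     # simulate a read-only location

    try:
        np.memmap(path, offset=0, shape=(16,))       # default mode='r+'
    except PermissionError as err:
        print("default mode fails:", err)

    data = np.memmap(path, mode='r', offset=0, shape=(16,))  # read-only mapping
    print(data[:4])                                  # [0 1 2 3]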

View File

@@ -73,6 +73,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
     llama_model_params mparams_copy = *mparams;
     mparams_copy.no_alloc = true;
     mparams_copy.use_mmap = false;
+    mparams_copy.use_mlock = false;
 
     llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
     if (model == nullptr) {
@@ -184,6 +185,7 @@ static void llama_params_fit_impl(
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
+    int64_t sum_projected_model = 0;
     int64_t sum_projected_ctx = 0;
 
     if (nd > 1) {
@@ -199,6 +201,7 @@ static void llama_params_fit_impl(
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
+        sum_projected_model += dmd.mb.model;
         sum_projected_ctx += dmd.mb.context;
 
         if (nd > 1) {
@@ -234,10 +237,24 @@
     if (cparams->n_ctx == 0) {
         if (hp_nct > n_ctx_min) {
             const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
-            const uint32_t ctx_reduction = std::min(
-                uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+
+            int64_t memory_reduction = -global_surplus;
+            if (nd > 1) {
+                // for multiple devices we need to be more conservative in terms of how much context we think can fit:
+                // - for dense models only whole layers can be assigned to devices
+                // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
+                // - on average we expect a waste of 0.5 layers/tensors per device
+                // - use slightly more than the expected average for nd devices to be safe
+                const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
+                memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+            }
+
+            uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
             cparams->n_ctx = hp_nct - ctx_reduction;
-            const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
+            cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+            ctx_reduction = hp_nct - cparams->n_ctx;
+            memory_reduction = ctx_reduction * bytes_per_ctx;
             global_surplus += memory_reduction;
             LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                 __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
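
Taken together, the new sizing logic (1) starts from the projected memory deficit (-global_surplus), (2) on multi-GPU setups adds headroom of roughly half a model layer per device for dense models and a sixth of a layer for MoE models (the hp_nex == 0 ? 2 : 6 term, where hp_nex presumably counts experts), (3) converts the deficit into a context reduction by ceil-dividing by the projected bytes per context token, and (4) rounds the resulting n_ctx down to a multiple of 256. A small sketch that mirrors this arithmetic; every value below is made up for illustration:

    # Hedged sketch of the context-size reduction arithmetic; all numbers invented.
    MiB = 1024 * 1024

    hp_nct          = 131072          # trained context length (hypothetical)
    n_ctx_min       = 4096            # smallest acceptable context (hypothetical)
    bytes_per_ctx   = 48 * 1024       # projected context bytes per token (hypothetical)
    global_surplus  = -2000 * MiB     # negative: 2000 MiB short (hypothetical)
    nd              = 2               # number of devices
    model_per_layer = 300 * MiB       # projected model bytes per GPU layer (hypothetical)
    is_moe          = False           # stands in for hp_nex != 0

    memory_reduction = -global_surplus
    if nd > 1:
        # headroom for whole-layer / whole-tensor assignment waste across devices
        memory_reduction += (nd + 1) * model_per_layer // (6 if is_moe else 2)

    ctx_reduction = min((memory_reduction + bytes_per_ctx - 1) // bytes_per_ctx,  # ceil division
                        hp_nct - n_ctx_min)
    n_ctx = hp_nct - ctx_reduction
    n_ctx = max(n_ctx - n_ctx % 256, n_ctx_min)      # round down for the CUDA backend
    print(f"n_ctx: {hp_nct} -> {n_ctx}")             # here: 131072 -> 78592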