ggml : use WARP_SIZE/2 for argmax reduction offset (#18092)

gguf-py : allow converting multi-tensor models from read-only locations (#18100)
llama-fit-params: force disable mlock (#18103)
2025-12-17 11:47:01 +08:00 · 2025-12-17 02:27:03 +01:00 · 2025-12-17 00:50:12 +01:00
3 changed files with 6 additions and 5 deletions
--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@@ -21,7 +21,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
    }
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
+    for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
        if (val > maxval) {
@@ -50,7 +50,7 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
                argmax = shared_argmax[lane_id];
            }
 #pragma unroll
-            for (int offset = 16; offset > 0; offset >>= 1) {
+            for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
                if (val > maxval) {
--- a/gguf-py/gguf/utility.py
+++ b/gguf-py/gguf/utility.py
@@ -288,7 +288,7 @@ class LocalTensor:
    data_range: LocalTensorRange
    def mmap_bytes(self) -> np.ndarray:
-        return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size)
+        return np.memmap(self.data_range.filename, mode='r', offset=self.data_range.offset, shape=self.data_range.size)
 class SafetensorsLocal:
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -71,8 +71,9 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
    }, &ud);
    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc = true;
+    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap = false;
+    mparams_copy.use_mmap  = false;
+    mparams_copy.use_mlock = false;
    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
    if (model == nullptr) {
Author	SHA1	Message	Date
Aadeshveer Singh	58062860af	ggml : use WARP_SIZE/2 for argmax reduction offset (#18092)	2025-12-17 11:47:01 +08:00
Yuri Khrustalev	2973a65ecb	gguf-py : allow converting multi-tensor models from read-only locations (#18100)	2025-12-17 02:27:03 +01:00
Johannes Gäßler	d0794e89d9	llama-fit-params: force disable mlock (#18103)	2025-12-17 00:50:12 +01:00