From 10d197409bd9537ff302ad09966fe406882fef9d Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 15 Sep 2025 14:38:42 -0700
Subject: [PATCH 01/19] releases : switch to rocWMMA develop branch, add
 gfx1151 (#15992)

* releases : switch to rocWMMA develop branch, add gfx1151

* remove unused variable ROCM_VERSION
---
 .github/workflows/release.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e959209ab2..f461456edf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -530,15 +530,13 @@ jobs:
     runs-on: windows-2022
 
     env:
-      # The ROCm version must correspond to the version used in the HIP SDK.
-      ROCM_VERSION: "6.4.2"
       HIPSDK_INSTALLER_VERSION: "25.Q3"
 
     strategy:
       matrix:
         include:
           - name: "radeon"
-            gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+            gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
 
     steps:
       - name: Clone
@@ -548,7 +546,7 @@ jobs:
       - name: Clone rocWMMA repository
         id: clone_rocwmma
         run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1
+          git clone https://github.com/rocm/rocwmma --branch develop --depth 1
 
       - name: Cache ROCm Installation
         id: cache-rocm

From dc381aa9a6dc45f00673471d34b8bddd30e77570 Mon Sep 17 00:00:00 2001
From: Diego Devesa <slarengh@gmail.com>
Date: Mon, 15 Sep 2025 14:38:52 -0700
Subject: [PATCH 02/19] docker : enable rocWMMA in ROCm images, add gfx1151
 (#15997)

---
 .devops/rocm.Dockerfile | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 221077197d..106c62b4dc 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -17,14 +17,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # gfx906 is deprecated
 #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
 
-ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201'
-#ARG ROCM_DOCKER_ARCH=gfx1100
+ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
+#ARG ROCM_DOCKER_ARCH='gfx1151'
 
-# Set ROCm architectured
+# Set ROCm architectures
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-# ENV CC=/opt/rocm/llvm/bin/clang
-# ENV CXX=/opt/rocm/llvm/bin/clang++
 
 RUN apt-get update \
     && apt-get install -y \
@@ -39,8 +36,16 @@ WORKDIR /app
 
 COPY . .
 
+RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1
+
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
+    cmake -S . -B build \
+        -DGGML_HIP=ON \
+        -DGGML_HIP_ROCWMMA_FATTN=ON \
+        -DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \
+        -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
+        -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
+        -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
     && cmake --build build --config Release -j$(nproc)
 
 RUN mkdir -p /app/lib \

From 3d4053f77f0f78ee2b791088c02af653ebee42dd Mon Sep 17 00:00:00 2001
From: Jake Karnes <jake.karnes@gmail.com>
Date: Mon, 15 Sep 2025 16:28:31 -0600
Subject: [PATCH 03/19] CUDA: fix im2col_3d to respect non-contiguous inputs
 (views) (#15956)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix im2col_3d to respect non-contiguous inputs (views)

The CUDA 3D im2col kernel computed source addresses assuming compact layout (products of dims), ignoring nb[] strides.

This patch switches im2col_3d source indexing to use true strides derived from src1->nb[] (in elements), mirroring the approach used in the 2D CUDA im2col path. Destination indexing is unchanged.

* use ggml_element_size() for src strides

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/im2col.cu | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
index 7737d6a5d5..56dc054574 100644
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -122,11 +122,14 @@ static  __global__ void im2col_3d_kernel(
         int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW,
         int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW,
         int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH,
+        int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
         int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) {
     const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= IC_KD_KH_KW) {
         return;
     }
+    GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH);
+    GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW);
 
     const int64_t iic = i / KD_KH_KW;
     const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW;
@@ -148,7 +151,7 @@ static  __global__ void im2col_3d_kernel(
         if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
             dst[offset_dst] = 0.0f;
         } else {
-            const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw;
+            const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
             dst[offset_dst] = src[offset_src];
         }
     }
@@ -159,6 +162,7 @@ template <typename T>
 static void im2col_3d_cuda(const float * src, T* dst,
     int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
     int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
+    int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
     int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
     const int64_t OH_OW = OH*OW;
     const int64_t KD_KH_KW = KD*KH*KW;
@@ -179,23 +183,30 @@ static void im2col_3d_cuda(const float * src, T* dst,
                                                                                            OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
                                                                                            IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
                                                                                            OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH,
+                                                                                           stride_q, stride_z, stride_y, stride_x,
                                                                                            s0, s1, s2, p0, p1, p2, d0, d1, d2);
 }
 
 static void im2col_3d_cuda_f16(const float * src, half * dst,
     int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
     int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
+    int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
     int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
 
-    im2col_3d_cuda<half>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
+    im2col_3d_cuda<half>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
+                         stride_q, stride_z, stride_y, stride_x,
+                         s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
 }
 
 static void im2col_3d_cuda_f32(const float * src, float * dst,
     int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
     int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
+    int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
     int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
 
-    im2col_3d_cuda<float>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
+    im2col_3d_cuda<float>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
+                          stride_q, stride_z, stride_y, stride_x,
+                          s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
 }
 
 void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -235,9 +246,19 @@ void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const int64_t OH = ne2;
     const int64_t OW = ne1;
 
+    const size_t  es       = ggml_element_size(src1);
+    const int64_t stride_x = src1->nb[0] / es;
+    const int64_t stride_y = src1->nb[1] / es;
+    const int64_t stride_z = src1->nb[2] / es;
+    const int64_t stride_q = src1->nb[3] / es;
+
     if(dst->type == GGML_TYPE_F16) {
-        im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
+        im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
+                           stride_q, stride_z, stride_y, stride_x,
+                           s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
     } else {
-        im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
+        im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
+                           stride_q, stride_z, stride_y, stride_x,
+                           s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
     }
 }

From 6d758839ff741d4966ca92b7f801b7a8b5b96364 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Tue, 16 Sep 2025 10:38:28 +0800
Subject: [PATCH 04/19] Add LLaDA-7b-MoE diffusion model (#16003)

---
 common/arg.cpp                       |   2 +-
 convert_hf_to_gguf.py                |  73 +++++++++++
 convert_hf_to_gguf_update.py         |   1 +
 examples/diffusion/diffusion-cli.cpp |  24 ++--
 gguf-py/gguf/constants.py            |  19 +++
 src/llama-arch.cpp                   |  22 ++++
 src/llama-arch.h                     |   1 +
 src/llama-model.cpp                  | 179 +++++++++++++++++++++++++++
 src/llama-vocab.cpp                  |   3 +-
 9 files changed, 315 insertions(+), 9 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index c15008fe79..19189d8f9f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1704,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 855789f1ba..5a21ba2110 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -888,6 +888,9 @@ class TextModel(ModelBase):
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
+            # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
+            res = "llada-moe"
 
         if res is None:
             logger.warning("\n")
@@ -8239,6 +8242,76 @@ class HunYuanMoEModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
+class LLaDAMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLADA_MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
+        self.gguf_writer.add_mask_token_id(156895)
+        self.gguf_writer.add_causal_attention(False)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index eb8fdfa7e1..21bb4a9f3e 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,6 +139,7 @@ models = [
     {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
+    {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index abf7fb3573..273942a165 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -510,19 +510,27 @@ static void diffusion_generate(llama_context *          ctx,
     n_generated = params.max_length;
 }
 
-static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
+static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
     if (!use_chat_template) {
         return prompt;
     }
 
     auto chat_templates = common_chat_templates_init(model, "");
-
     common_chat_templates_inputs inputs;
-    common_chat_msg              user_msg;
-    user_msg.role                = "user";
-    user_msg.content             = prompt;
-    inputs.add_generation_prompt = true;
+    common_chat_msg system_msg;
+
+    if (!system_prompt.empty()) {
+        system_msg.role = "system";
+        system_msg.content = system_prompt;
+        inputs.messages.push_back(system_msg);
+    }
+
+    common_chat_msg user_msg;
+    user_msg.role = "user";
+    user_msg.content = prompt;
+
     inputs.messages.push_back(user_msg);
+    inputs.add_generation_prompt = true;
 
     auto result = common_chat_templates_apply(chat_templates.get(), inputs);
 
@@ -579,7 +587,8 @@ int main(int argc, char ** argv) {
     llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
 
     const llama_vocab * vocab            = llama_model_get_vocab(model);
-    std::string         formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
+
+    std::string         formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);
 
     std::vector<llama_token> input_tokens = common_tokenize(vocab,
                                                             formatted_prompt,
@@ -596,6 +605,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_token mask_token_id = llama_vocab_mask(vocab);
+
     GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
 
     bool visual_mode = params.diffusion.visual_mode;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c7edef919b..7e16cbcbde 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -399,6 +399,7 @@ class MODEL_ARCH(IntEnum):
     DREAM            = auto()
     SMALLTHINKER     = auto()
     LLADA            = auto()
+    LLADA_MOE        = auto()
     SEED_OSS         = auto()
 
 
@@ -735,6 +736,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.DREAM:            "dream",
     MODEL_ARCH.SMALLTHINKER:     "smallthinker",
     MODEL_ARCH.LLADA:            "llada",
+    MODEL_ARCH.LLADA_MOE:        "llada-moe",
     MODEL_ARCH.SEED_OSS:         "seed_oss",
 }
 
@@ -2693,6 +2695,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.LLADA_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+    ],
     # TODO
 }
 
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 3122331d8e..a4d2973ada 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM,            "dream"            },
     { LLM_ARCH_SMALLTHINKER,     "smallthinker"     },
     { LLM_ARCH_LLADA,            "llada"            },
+    { LLM_ARCH_LLADA_MOE,        "llada-moe"        },
     { LLM_ARCH_SEED_OSS,         "seed_oss"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
@@ -2147,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2427,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a4ac28b525..d181ce6784 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -100,6 +100,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4864ed8e72..731e87383b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -936,6 +936,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.causal_attn = false;
             }
             break;
+        case LLM_ARCH_LLADA_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // diffusion language model uses non-causal attention
+                hparams.causal_attn = false;
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_A1_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
@@ -2387,6 +2399,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
                 break;
+            case LLM_ARCH_LLADA_MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                    }
+                } break;
             case LLM_ARCH_LLAMA4:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -12444,6 +12490,132 @@ struct llm_build_olmoe : public llm_graph_context {
     }
 };
 
+struct llm_build_llada_moe : public llm_graph_context {
+    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_openelm : public llm_graph_context {
     llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18636,6 +18808,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             {
                 res = nullptr;
             } break;
@@ -18841,6 +19014,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada>(*this, params);
             }
             break;
+        case LLM_ARCH_LLADA_MOE:
+            {
+                llm = std::make_unique<llm_build_llada_moe>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -19307,6 +19485,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
+        case LLM_ARCH_LLADA_MOE:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index b551253afb..8cb36661a0 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1962,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
             } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "llada-moe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
             } else if (

From 07808ebb07e2b1aa19032705e332679ddf967614 Mon Sep 17 00:00:00 2001
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Date: Mon, 15 Sep 2025 22:54:44 -0400
Subject: [PATCH 05/19] cmake : Do not install tools on iOS targets (#15903)

---
 CMakeLists.txt                         | 7 +++++++
 tools/batched-bench/CMakeLists.txt     | 5 ++++-
 tools/cvector-generator/CMakeLists.txt | 5 ++++-
 tools/export-lora/CMakeLists.txt       | 5 ++++-
 tools/gguf-split/CMakeLists.txt        | 5 ++++-
 tools/imatrix/CMakeLists.txt           | 5 ++++-
 tools/llama-bench/CMakeLists.txt       | 5 ++++-
 tools/main/CMakeLists.txt              | 5 ++++-
 tools/mtmd/CMakeLists.txt              | 2 +-
 tools/perplexity/CMakeLists.txt        | 5 ++++-
 tools/quantize/CMakeLists.txt          | 5 ++++-
 tools/run/CMakeLists.txt               | 4 +++-
 tools/tokenize/CMakeLists.txt          | 4 +++-
 tools/tts/CMakeLists.txt               | 5 ++++-
 14 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36a2078e4c..4720e1f1a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,12 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
+if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+else()
+    set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+endif()
+
 #
 # option list
 #
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
 
 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index 68ad707f32..4a46b57a52 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt
index 49ad9561c8..baeb4d00c1 100644
--- a/tools/cvector-generator/CMakeLists.txt
+++ b/tools/cvector-generator/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt
index 310455787a..cddfa77f02 100644
--- a/tools/export-lora/CMakeLists.txt
+++ b/tools/export-lora/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt
index c407e2f0af..9b2125087c 100644
--- a/tools/gguf-split/CMakeLists.txt
+++ b/tools/gguf-split/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
index 412696c47c..22f2fe5fdb 100644
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index 17e3b9b87b..b8543a9692 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt
index af3d9150f8..8f8e9d444c 100644
--- a/tools/main/CMakeLists.txt
+++ b/tools/main/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 0979488560..2381012a0d 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -55,7 +55,7 @@ add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
 set(TARGET llama-mtmd-cli)
 add_executable         (${TARGET} mtmd-cli.cpp)
 set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
-if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS")
+if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
 target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 3e68640933..12b28b2be4 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 47e5cbe30c..bd9ddbd67d 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -1,6 +1,9 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt
index d018959698..e52294ccc0 100644
--- a/tools/run/CMakeLists.txt
+++ b/tools/run/CMakeLists.txt
@@ -10,6 +10,8 @@ if (LLAMA_CURL)
     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
-install(TARGETS ${TARGET} RUNTIME)
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt
index 1690b53e5d..feed9a1062 100644
--- a/tools/tokenize/CMakeLists.txt
+++ b/tools/tokenize/CMakeLists.txt
@@ -1,5 +1,7 @@
 set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
-install(TARGETS ${TARGET} RUNTIME)
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt
index c72bd814c3..76320d4c2d 100644
--- a/tools/tts/CMakeLists.txt
+++ b/tools/tts/CMakeLists.txt
@@ -1,5 +1,8 @@
 set(TARGET llama-tts)
 add_executable(${TARGET} tts.cpp)
-install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()

From 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Sep 2025 05:57:16 +0200
Subject: [PATCH 06/19] ci : update macos-latest* jobs to use macos-latest
 (#15938)

* ci : update macos-latest* jobs to use macos-latest

This commit updates the jobs that are named macos-latest* to use the
macos-latest label instead explicit versions.

The motivation for this is that there is currently a mixuture of
versions in this workflow and there are jobs that are failing because
they require a newer version.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17644792595/job/50140010907#step:5:1759

* ci : add xcodebuild -downloadPlatform iOS command
---
 .github/workflows/build.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1bd35627da..9a668bf8dd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -56,7 +56,7 @@ env:
 
 jobs:
   macOS-latest-cmake-arm64:
-    runs-on: macos-14
+    runs-on: macos-latest
 
     steps:
       - name: Clone
@@ -97,7 +97,7 @@ jobs:
           ctest -L 'main|curl' --verbose --timeout 900
 
   macOS-latest-cmake-x64:
-    runs-on: macos-13
+    runs-on: macos-latest
 
     steps:
       - name: Clone
@@ -138,7 +138,7 @@ jobs:
           ctest -L main --verbose --timeout 900
 
   macOS-latest-cmake-arm64-webgpu:
-    runs-on: macos-14
+    runs-on: latest
 
     steps:
       - name: Clone
@@ -1171,7 +1171,9 @@ jobs:
           ./build-xcframework.sh
 
       - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
+        run: |
+          xcodebuild -downloadPlatform iOS
+          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
 
   android-build:
     runs-on: ubuntu-latest

From f1fbffb5c0b34b2a68febb7da3fd0f8333f1ed4c Mon Sep 17 00:00:00 2001
From: Bowen Han <fancycode@gmail.com>
Date: Mon, 15 Sep 2025 23:59:19 -0700
Subject: [PATCH 07/19] fix: apply clang-format to CUDA macros (#16017)

clang-format previously broke long CUDA macros (e.g. __launch_bounds__) into
unreadable line breaks inside template declarations, such as:

  template<int D, int ncols, int nwarps, int VKQ_stride,
           typename KQ_acc_t, bool use_logit_softcap>
      __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)

This change adjusts formatting rules so that CUDA macros remain consistent
and aligned with the surrounding template syntax.
---
 .clang-format | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.clang-format b/.clang-format
index 117e6986f6..742723fc8f 100644
--- a/.clang-format
+++ b/.clang-format
@@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
+# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
+AttributeMacros:
+  - __host__
+  - __device__
+  - __global__
+  - __forceinline__
+  - __launch_bounds__
 BinPackArguments: true
 BinPackParameters: false # OnePerLine
 BitFieldColonSpacing: Both

From 76888d202ed2b835ae19ea9f9db6baf39e419297 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Sep 2025 13:41:38 +0200
Subject: [PATCH 08/19] ci : upload xcframework artifact from ios-xcode-build
 job (#16010)

This commit updates the github workflows build.yml file to include steps
for uploading and downloading the xcframework artifact. The
macos-latest-swift job now depends on the ios-xcode-build job and
downloads the xcframework artifact produced by it.

The motivation for this changes is that it takes a long time to build
the xcframework and we are currently doing this twice in the workflow.
With this change, we only build it once and reuse the artifact.
---
 .github/workflows/build.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9a668bf8dd..7f2dccde0c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -711,6 +711,7 @@ jobs:
 
   macOS-latest-swift:
     runs-on: macos-latest
+    needs: ios-xcode-build
 
     strategy:
       matrix:
@@ -727,6 +728,12 @@ jobs:
           key: macOS-latest-swift
           evict-old-files: 1d
 
+      - name: Download xcframework artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: llama-xcframework
+          path: build-apple/llama.xcframework/
+
       - name: Dependencies
         id: depends
         continue-on-error: true
@@ -748,11 +755,6 @@ jobs:
             -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
   windows-msys2:
     runs-on: windows-2025
 
@@ -1170,6 +1172,13 @@ jobs:
         run: |
           ./build-xcframework.sh
 
+      - name: Upload xcframework artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-xcframework
+          path: build-apple/llama.xcframework/
+          retention-days: 1
+
       - name: Build Xcode project
         run: |
           xcodebuild -downloadPlatform iOS

From 3913f8730ec6d6245480affc30ae3049107956f4 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Sep 2025 15:25:57 +0200
Subject: [PATCH 09/19] ggml : fix padding in timestep embedding kernels
 (#15932)

* ggml : remove adding extra dim timestep embedding

This commit updates the ggml_timestep_embedding function to no longer
add an extra dimension when the specified dimension is odd.

The motivation for this change is that this introduces an unnecessary
dimension when the dimension is odd, which caused an issue in the
kernels which were not expecting this extra dimension and it resulted in
uninitialized memory for the second to last dimension.

* ggml-cuda : fix padding in timestep embedding kernel

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.

* ggml-metal : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel

* ggml-opencl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-sycl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-vulkan : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-cpu : fix padding in timestep embedding function

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.
---
 ggml/src/ggml-cpu/ops.cpp                                  | 1 -
 ggml/src/ggml-cuda/tsembd.cu                               | 6 +++---
 ggml/src/ggml-metal/ggml-metal.metal                       | 2 +-
 ggml/src/ggml-opencl/kernels/tsembd.cl                     | 4 ++--
 ggml/src/ggml-sycl/tsembd.cpp                              | 7 ++++---
 .../src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp | 7 ++++---
 ggml/src/ggml.c                                            | 6 +-----
 7 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 212e52ef6a..c4824d145a 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32(
         }
         if (dim % 2 != 0 && ith == 0) {
             embed_data[2 * half] = 0.f;
-            embed_data[dim] = 0.f;
         }
     }
 }
diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu
index 153ddbcda9..b91a26fc80 100644
--- a/ggml/src/ggml-cuda/tsembd.cu
+++ b/ggml/src/ggml-cuda/tsembd.cu
@@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d
     int j = threadIdx.x + blockIdx.x * blockDim.x;
     float * embed_data = (float *)((char *)dst +  i*nb1);
 
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
+    int half = dim / 2;
+    if (dim % 2 != 0 && j == half) {
+        embed_data[2 * half] = 0.f;
     }
 
-    int half = dim / 2;
     if (j >= half) {
         return;
     }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 4314c9cc93..5057e264f6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4167,7 +4167,7 @@ kernel void kernel_timestep_embedding_f32(
     }
 
     if (args.dim % 2 != 0 && tpitg.x == 0) {
-        embed_data[args.dim] = 0.f;
+        embed_data[2 * half_] = 0.f;
     }
 }
 
diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl
index 4b1107f70b..21444bd958 100644
--- a/ggml/src/ggml-opencl/kernels/tsembd.cl
+++ b/ggml/src/ggml-opencl/kernels/tsembd.cl
@@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding(
     local_half_dim = logical_dim / 2;
     local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);
 
-    if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
-        local_embed_data_ptr[logical_dim] = 0.0f;
+    if (logical_dim % 2 != 0 && local_j == local_half_dim) {
+        local_embed_data_ptr[2 * local_half_dim] = 0.0f;
     }
 
     if (local_j >= local_half_dim) {
diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp
index f6ca626ea7..f2003794d3 100644
--- a/ggml/src/ggml-sycl/tsembd.cpp
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@@ -21,11 +21,12 @@ static void timestep_embedding_f32(
     int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
     float * embed_data = (float *)((char *)dst +  i*nb1);
 
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
+    int half = dim / 2;
+
+    if (dim % 2 != 0 && j == half) {
+        embed_data[2 * half] = 0.f;
     }
 
-    int half = dim / 2;
     if (j >= half) {
         return;
     }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
index 79e065a931..ce8e09442d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
@@ -24,11 +24,12 @@ void main() {
     const uint j = gl_GlobalInvocationID.x;
     const uint d_offset = i * p.nb1;
 
-    if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
-        data_d[d_offset + p.dim] = 0.f;
+    const uint half_dim = p.dim / 2;
+
+    if (p.dim % 2 != 0 && j == half_dim) {
+        data_d[d_offset + 2 * half_dim] = 0.f;
     }
 
-    const uint half_dim = p.dim / 2;
     if (j >= half_dim) {
         return;
     }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 50dc1aa24f..3584827dca 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4923,12 +4923,8 @@ struct ggml_tensor * ggml_timestep_embedding(
         struct ggml_tensor  * timesteps,
         int                   dim,
         int                   max_period) {
-    int actual_dim = dim;
-    if (dim % 2 != 0) {
-        actual_dim = dim + 1;
-    }
 
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]);
 
     ggml_set_op_params_i32(result, 0, dim);
     ggml_set_op_params_i32(result, 1, max_period);

From 77475530b8bbea3bf578632507e1284cdfe2c8c0 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Sep 2025 15:27:52 +0200
Subject: [PATCH 10/19] ci : use macos-latest for arm64 webgpu build (#16029)

This commit updates the runs-on field for the macOS arm64 webgpu build
job to use macos-latest instead of just latest.

The motivation for this is that this job can wait for a runner to pick
up the job for a very long time, sometimes over 7 hours. This is an
attempt to see if this change can help reduce the wait time.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17754163447/job/50454257570?pr=16004
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7f2dccde0c..0a00c85b73 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -138,7 +138,7 @@ jobs:
           ctest -L main --verbose --timeout 900
 
   macOS-latest-cmake-arm64-webgpu:
-    runs-on: latest
+    runs-on: macos-latest
 
     steps:
       - name: Clone

From 8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68 Mon Sep 17 00:00:00 2001
From: jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com>
Date: Tue, 16 Sep 2025 16:17:08 +0200
Subject: [PATCH 11/19] llama-bench: add --n-cpu-moe support (#15952)

* llama-bench: add --n-cpu-moe support

Support --n-cpu-moe in llama-bench the same way it is supported by
llama-server.
---
 common/arg.cpp                    |  8 +--
 common/common.h                   | 14 +++++
 tools/llama-bench/llama-bench.cpp | 87 +++++++++++++++++++++++++------
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 19189d8f9f..9fd8858e82 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
@@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             for (int i = 0; i < value; ++i) {
                 // keep strings alive and avoid leaking memory by storing them in a static vector
                 static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe-draft", "-cmoed"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
@@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (int i = 0; i < value; ++i) {
                 static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
diff --git a/common/common.h b/common/common.h
index 5063d73f96..83d44dbaa7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+static std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 95f662a297..ad47bf144f 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -250,6 +250,7 @@ struct cmd_params {
     std::vector<bool>                cpu_strict;
     std::vector<int>                 poll;
     std::vector<int>                 n_gpu_layers;
+    std::vector<int>                 n_cpu_moe;
     std::vector<std::string>         rpc_servers;
     std::vector<llama_split_mode>    split_mode;
     std::vector<int>                 main_gpu;
@@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = {
     /* cpu_strict           */ { false },
     /* poll                 */ { 50 },
     /* n_gpu_layers         */ { 99 },
+    /* n_cpu_moe            */ { 0 },
     /* rpc_servers          */ { "" },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
@@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n",
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
+           join(cmd_params_defaults.n_cpu_moe, ",").c_str());
     if (llama_supports_rpc()) {
         printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
                join(cmd_params_defaults.rpc_servers, ",").c_str());
@@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 auto p = parse_int_range(argv[i]);
                 params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+            } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end());
             } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
                 if (++i >= argc) {
                     invalid_param = true;
@@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_gpu_layers.empty()) {
         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
     }
+    if (params.n_cpu_moe.empty()) {
+        params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
+    }
     if (params.rpc_servers.empty()) {
         params.rpc_servers = cmd_params_defaults.rpc_servers;
     }
@@ -901,6 +915,7 @@ struct cmd_params_instance {
     bool               cpu_strict;
     int                poll;
     int                n_gpu_layers;
+    int                n_cpu_moe;
     std::string        rpc_servers_str;
     llama_split_mode   split_mode;
     int                main_gpu;
@@ -973,20 +988,50 @@ struct cmd_params_instance {
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap     = use_mmap;
 
-        if (tensor_buft_overrides.empty()) {
-            mparams.tensor_buft_overrides = nullptr;
+        if (n_cpu_moe <= 0) {
+            if (tensor_buft_overrides.empty()) {
+                mparams.tensor_buft_overrides = nullptr;
+            } else {
+                GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr &&
+                            "Tensor buffer overrides not terminated with empty pattern");
+                mparams.tensor_buft_overrides = tensor_buft_overrides.data();
+            }
         } else {
-            GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
-            mparams.tensor_buft_overrides = tensor_buft_overrides.data();
+            static std::vector<llama_model_tensor_buft_override> merged;
+            static std::vector<std::string> patterns;
+
+            merged.clear();
+            patterns.clear();
+
+            auto first = tensor_buft_overrides.begin();
+            auto last  = tensor_buft_overrides.end();
+            if (first != last && (last - 1)->pattern == nullptr) {
+                --last;
+            }
+            merged.insert(merged.end(), first, last);
+
+            patterns.reserve((size_t) n_cpu_moe);
+            merged.reserve(merged.size() + (size_t) n_cpu_moe + 1);
+
+            for (int i = 0; i < n_cpu_moe; ++i) {
+                patterns.push_back(llm_ffn_exps_block_regex(i));
+                merged.push_back({ patterns.back().c_str(),
+                                ggml_backend_cpu_buffer_type() });
+            }
+
+            merged.push_back({ nullptr, nullptr });
+
+            mparams.tensor_buft_overrides = merged.data();
         }
 
         return mparams;
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
-               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
-               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
+               rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
+               main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
     llama_context_params to_llama_cparams() const {
@@ -1014,6 +1059,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // clang-format off
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
+    for (const auto & ncmoe : params.n_cpu_moe)
     for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
@@ -1051,6 +1097,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
+                /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
@@ -1083,6 +1130,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
+                /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
@@ -1115,6 +1163,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .cpu_strict   = */ cs,
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
+                /* .n_cpu_moe    = */ ncmoe,
                 /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
@@ -1152,6 +1201,7 @@ struct test {
     ggml_type                type_k;
     ggml_type                type_v;
     int                      n_gpu_layers;
+    int                      n_cpu_moe;
     llama_split_mode         split_mode;
     int                      main_gpu;
     bool                     no_kv_offload;
@@ -1186,6 +1236,7 @@ struct test {
         type_k         = inst.type_k;
         type_v         = inst.type_v;
         n_gpu_layers   = inst.n_gpu_layers;
+        n_cpu_moe      = inst.n_cpu_moe;
         split_mode     = inst.split_mode;
         main_gpu       = inst.main_gpu;
         no_kv_offload  = inst.no_kv_offload;
@@ -1236,12 +1287,14 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
-            "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
-            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
-            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
-            "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
+            "build_commit",   "build_number",  "cpu_info",      "gpu_info",       "backends",
+            "model_filename", "model_type",    "model_size",    "model_n_params", "n_batch",
+            "n_ubatch",       "n_threads",     "cpu_mask",      "cpu_strict",     "poll",
+            "type_k",         "type_v",        "n_gpu_layers",  "n_cpu_moe",      "split_mode",
+            "main_gpu",       "no_kv_offload", "flash_attn",    "tensor_split",   "tensor_buft_overrides",
+            "use_mmap",       "embeddings",    "no_op_offload", "n_prompt",       "n_gen",
+            "n_depth",        "test_time",     "avg_ns",        "stddev_ns",      "avg_ts",
+            "stddev_ts"
         };
         return fields;
     }
@@ -1251,8 +1304,8 @@ struct test {
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
-            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-            field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
+            field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1320,6 +1373,7 @@ struct test {
                                             ggml_type_name(type_k),
                                             ggml_type_name(type_v),
                                             std::to_string(n_gpu_layers),
+                                            std::to_string(n_cpu_moe),
                                             split_mode_str(split_mode),
                                             std::to_string(main_gpu),
                                             std::to_string(no_kv_offload),
@@ -1568,6 +1622,9 @@ struct markdown_printer : public printer {
         if (!is_cpu_backend) {
             fields.emplace_back("n_gpu_layers");
         }
+        if (params.n_cpu_moe.size() > 1) {
+            fields.emplace_back("n_cpu_moe");
+        }
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }

From d5fabe3682de515fd09d6c981f7a0d1b75614455 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Wed, 17 Sep 2025 14:33:08 +0800
Subject: [PATCH 12/19] CANN: Optimize ggml_cann_set_device (#15935)

* CANN: Fix ggml_cann_set_device to avoid redundant device switches

- Added a check to skip aclrtSetDevice if the current device is already set.
- Prevents unnecessary context switches while keeping thread/device consistency.

* CANN: add device default id
---
 ggml/src/ggml-cann/common.h      |  5 ++++-
 ggml/src/ggml-cann/ggml-cann.cpp | 12 ++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index c5fce8dc91..b707b84359 100755
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -526,7 +526,10 @@ struct ggml_backend_cann_context {
      */
     aclrtStream stream(int stream) {
         if (streams[stream] == nullptr) {
-            ggml_cann_set_device(device);
+            // If the device is not set here, destroying the stream later may cause a mismatch
+            // between the thread contexts where the stream was created and destroyed.
+            // However, I printed the device_id, thread_id, and stream, and they are all consistent.
+            ACL_CHECK(aclrtSetDevice(device));
             ACL_CHECK(aclrtCreateStream(&streams[stream]));
         }
         return streams[stream];
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 19a18a281d..56d82b4af3 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -75,13 +75,12 @@
  * @param device The device ID to set.
  */
 void ggml_cann_set_device(const int32_t device) {
-    // TODO: uncomment these lines after empty context has fixed.
-    // int current_device;
-    // ACL_CHECK(aclrtGetDevice(&current_device));
+    int current_device = -1;
+    aclrtGetDevice(&current_device);
 
-    // if (device == current_device) {
-    //   return;
-    // }
+    if (device == current_device) {
+      return;
+    }
     ACL_CHECK(aclrtSetDevice(device));
 }
 
@@ -1729,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_get_rows(ctx, dst);
             break;
         case GGML_OP_SET_ROWS:
+            std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl;
             ggml_cann_set_rows(ctx, dst);
             break;
         case GGML_OP_DUP:

From 85286f354813056f6c835046c0acfa3bf6ba9432 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 17 Sep 2025 00:01:58 -0700
Subject: [PATCH 13/19] model : add OLMo3 support (#16015)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add HF to gguf conversion logic for Olmo3

* Add Olmo3 implementation

* Update rope comment

* Fix indentation

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 convert_hf_to_gguf.py | 25 +++++++++++++++++++++++
 src/llama-model.cpp   | 47 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5a21ba2110..ce83f24695 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6009,9 +6009,34 @@ class SeedOssModel(TextModel):
 
 
 @ModelBase.register("Olmo2ForCausalLM")
+@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        if "sliding_window" in self.hparams:
+            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+            sliding_window_pattern = []
+            if "layer_types" in self.hparams:
+                sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+            else:
+                # Olmo2 does not use sliding window attention.
+                # Olmo3 defaults to using sliding window for all layers except every 4th.
+                for i in range(self.hparams["num_hidden_layers"]):
+                    sliding_window_pattern.append((i + 1) % 4 != 0)
+
+            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
 
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 731e87383b..2be807a6a9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1350,6 +1350,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
@@ -12233,6 +12241,7 @@ struct llm_build_olmo : public llm_graph_context {
     }
 };
 
+template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
     llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12248,7 +12257,14 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv();
+        }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12281,17 +12297,36 @@ struct llm_build_olmo2 : public llm_graph_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_ext(
+                const bool is_swa = hparams.is_swa(il);
+
+                if (is_swa) {
+                    // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+                    // This is achieved here by setting freq_scale and attn_factor to 1.
+                    // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                        );
+
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                        );
+                } else {
+                    Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
 
-                Kcur = ggml_rope_ext(
+                    Kcur = ggml_rope_ext(
                         ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -19131,7 +19166,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_OLMO2:
             {
-                llm = std::make_unique<llm_build_olmo2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_OLMOE:
             {

From 1cbd80f8cf80a817715b1ccc5680fe2a3c5172c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Wed, 17 Sep 2025 15:29:00 +0800
Subject: [PATCH 14/19] examples : support encoder-decoder models in the simple
 example (#16002)

Signed-off-by: Jie Fu <jiefu@tencent.com>
---
 examples/simple/simple.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 633b87e584..d09771d104 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -145,6 +145,20 @@ int main(int argc, char ** argv) {
 
     llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        batch = llama_batch_get_one(&decoder_start_token_id, 1);
+    }
+
     // main loop
 
     const auto t_main_start = ggml_time_us();

From 745cbcf2fe1eb88f8db615ac622f0b944d924ad6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Wed, 17 Sep 2025 15:30:55 +0800
Subject: [PATCH 15/19] llama-quant : fix the verification of attention layers
 for encoder-decoder models (#16023)

Signed-off-by: Jie Fu <jiefu@tencent.com>
---
 src/llama-quant.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c93e8065a8..97228b2a69 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // attention layers have a non-zero number of kv heads
         int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            n_attn_layer *= 3;
+            // now n_attn_layer is the number of attention layers in the encoder
+            // for each decoder block, there are 2 attention layers
+            n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }

From a91d035b901e8a9edf810f63d130ee49adf27be2 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 17 Sep 2025 09:34:09 +0200
Subject: [PATCH 16/19] ci : revert back to macos-13 for macOS-latest-cmake-x64
 (#16040)

This commit reverts the change of the runs-on parameter for the
macOS-latest-cmake-x64 job back to macos-13 that was make in
Commit 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 ("ci : update
macos-latest* jobs to use macos-latest (#15938)").

The motivation for this is that using macos-latest will cause an ARM
based runner to be used, and not an x64 based runner.

Refs: https://github.com/ggml-org/llama.cpp/pull/15938#issuecomment-3300805127
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0a00c85b73..ff42b19f1d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -97,7 +97,7 @@ jobs:
           ctest -L 'main|curl' --verbose --timeout 900
 
   macOS-latest-cmake-x64:
-    runs-on: macos-latest
+    runs-on: macos-13
 
     steps:
       - name: Clone

From cb5bb6cc05119c24e7711ca2956cd0e6d409d396 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 17 Sep 2025 07:35:37 +0000
Subject: [PATCH 17/19] vulkan: automatically remove unsupported devices
 (#15976)

* remove unsupported vulkan devices

* make this happen during selection instead

* pass by reference
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 60a99dc78b..1f1136382e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4423,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 
 static bool ggml_vk_instance_validation_ext_available();
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
-
 static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
 
 static void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
@@ -4540,7 +4540,7 @@ static void ggml_vk_instance_init() {
             new_driver.pNext = &new_id;
             devices[i].getProperties2(&new_props);
 
-            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
+            if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
@@ -12738,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available(
     UNUSED(instance_extensions);
 }
 
+static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
+    VkPhysicalDeviceFeatures2 device_features2;
+    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+
+    VkPhysicalDeviceVulkan11Features vk11_features;
+    vk11_features.pNext = nullptr;
+    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+    device_features2.pNext = &vk11_features;
+
+    vkGetPhysicalDeviceFeatures2(vkdev, &device_features2);
+
+    return vk11_features.storageBuffer16BitAccess;
+}
+
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
     case VK_VENDOR_ID_INTEL:

From cd08fc3ecc0264b4414b68af3874a6c689ed60c1 Mon Sep 17 00:00:00 2001
From: David Ribeiro Alves <davidralves@gmail.com>
Date: Wed, 17 Sep 2025 01:08:02 -0700
Subject: [PATCH 18/19] common : Fix corrupted memory error on json grammar
 initialization (#16038)

Initalizing RESERVED_NAME in is_reserved_name() is not thread
safe and leads to corrupted memory when used from multiple threads
as can be seen in the asan trace below. This fixes the initialization
to make it thread-safe.

    #0 0x000100abd018 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) __hash_table:1565
    #1 0x000100ab0320 in SchemaConverter::visit(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) json-schema-to-grammar.cpp:802
    #2 0x000100aafc48 in std::__1::__function::__func<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2, std::__1::allocator<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> (std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #3 0x000100a2c938 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&), std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>, void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #4 0x000100a139f8 in foreach_function(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::function<void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)> const&) chat.cpp:762
    #5 0x000100a2a7f4 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0, std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0>, void (common_grammar_builder const&)>::operator()(common_grammar_builder const&) function.h:319
    #6 0x000100aa98f4 in build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&) json-schema-to-grammar.cpp:982
    #7 0x0001009c9314 in common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool) chat.cpp:1110
    #8 0x0001009b8afc in common_chat_templates_apply_jinja(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:1992
    #9 0x0001009b533c in common_chat_templates_apply(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:2074
    #10 0x000100810120 in llamacpp_apply_chat_template+0x724 (predict_oai-98384e17fb94e863:arm64+0x100090120)
    ...

==45482==Register values:
 x[0] = 0x00006020004147f8   x[1] = 0x00006080000013c8   x[2] = 0x0000000000000000   x[3] = 0x0000604006289738
 x[4] = 0x0000000000000002   x[5] = 0x0000000000000001   x[6] = 0x04034000004b4000   x[7] = 0x0000000000000001
 x[8] = 0xbebebebebebebebe   x[9] = 0x17d7d7d7d7d7d7d7  x[10] = 0x00000c04000828ff  x[11] = 0x0000000000000001
x[12] = 0x000000002018d383  x[13] = 0x0000000000000000  x[14] = 0xfa0000000000fafa  x[15] = 0x000010700001ffff
x[16] = 0x000000019dc012c0  x[17] = 0x00000001021284f8  x[18] = 0x0000000000000000  x[19] = 0x00000001700acdc0
x[20] = 0x0000000000000002  x[21] = 0x000000002018d384  x[22] = 0x16dd16fd2e731151  x[23] = 0x0000007000020000
x[24] = 0x0000000100c69c08  x[25] = 0x0000000100c69c20  x[26] = 0x00006080000013c7  x[27] = 0x0000000100c69c00
x[28] = 0x00000001700acd60     fp = 0x00000001700aceb0     lr = 0x0000000100abce30     sp = 0x00000001700acd60
AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV __hash_table:1565 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&)
Thread T5 created by T0 here:
    #0 0x0001020b99d4 in pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:arm64e+0x359d4)
    #1 0x000100873910 in std::sys::pal::unix::thread::Thread::new::h77254fdd87a28e05+0x118 (predict_oai-98384e17fb94e863:arm64+0x1000f3910)
    #2 0x0001007c7a1c in test::run_test::haeb3c2bcd5ed6cf6+0x76c (predict_oai-98384e17fb94e863:arm64+0x100047a1c)
    #3 0x0001007aedb0 in test::console::run_tests_console::he9d142d704f3a986+0x149c (predict_oai-98384e17fb94e863:arm64+0x10002edb0)
    #4 0x0001007c5758 in test::test_main::hf86a5e20735245b9+0x118 (predict_oai-98384e17fb94e863:arm64+0x100045758)
    #5 0x0001007c5da0 in test::test_main_static::h61ee9c8fd30abca0+0x54 (predict_oai-98384e17fb94e863:arm64+0x100045da0)
    ...

==45482==ABORTING
---
 common/json-schema-to-grammar.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 182c787544..db1f0b23dd 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 

From c959b676be29e93f8dbc3bd6056ceba812a9eb72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Wed, 17 Sep 2025 15:32:42 +0200
Subject: [PATCH 19/19] CUDA: fix FA occupancy, optimize tile kernel (#15982)

---
 ggml/src/ggml-cuda/common.cuh       |  16 +
 ggml/src/ggml-cuda/fattn-common.cuh |  13 +-
 ggml/src/ggml-cuda/fattn-tile.cu    | 551 ++++++++++++++++------------
 ggml/src/ggml-cuda/vendors/hip.h    |  34 +-
 4 files changed, 361 insertions(+), 253 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index b0feea3623..045c6d3006 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -75,6 +75,8 @@
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
 #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
 #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
@@ -325,6 +327,20 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
+// Maximum number of bytes that can be copied in a single instruction.
+static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
+#ifdef GGML_USE_HIP
+    return 16;
+#else
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+    return 16;
+#else
+    return 8;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // GGML_USE_HIP
+}
+
+
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index b69f57d659..142a3a88d1 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -647,9 +647,7 @@ static __global__ void flash_attn_stream_k_fixup(
 }
 
 template<int D> // D == head size
-#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
         const float  * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
@@ -692,10 +690,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_numerator   = 0.0f;
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
-        const float diff = meta[l].x - kqmax;
-        float KQ_max_scale = expf(diff);
-        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
-        *((uint32_t *) &KQ_max_scale) &= ftz_mask;
+        const float KQ_max_scale = expf(meta[l].x - kqmax);
 
         VKQ_numerator   += KQ_max_scale * VKQ_parts[l*D + tid];
         VKQ_denominator += KQ_max_scale * meta[l].y;
@@ -836,11 +831,10 @@ void launch_fattn(
         CUDA_CHECK(cudaGetLastError());
     }
 
-    int parallel_blocks = 1;
-
     const dim3 block_dim(warp_size, nwarps, 1);
     int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
     CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+    int parallel_blocks = max_blocks_per_sm;
 
     dim3 blocks_num;
     if (stream_k) {
@@ -862,9 +856,6 @@ void launch_fattn(
         GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
         const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
 
-        // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
-        parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
-
         // parallel_blocks must not be larger than what the tensor size allows:
         parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
 
diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu
index c6a399ce5d..a2d9951ea5 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -2,20 +2,30 @@
 #include "fattn-common.cuh"
 #include "fattn-tile.cuh"
 
-#define FATTN_TILE_NTHREADS 256
+// kq_stride == number of KQ rows to process per iteration
+// kq_nbatch == number of K columns to load in parallel for KQ calculation
 
 static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
     if (GGML_CUDA_CC_IS_AMD(cc)) {
+        if (GGML_CUDA_CC_IS_RDNA(cc)) {
+            switch (D) {
+                case 64:
+                    return 128;
+                case 128:
+                case 256:
+                    return ncols <= 16 ? 128 : 64;
+                default:
+                    GGML_ABORT("fatal error");
+                    return -1;
+            }
+        }
         switch (D) {
             case 64:
-                return 64;
+                return ncols == 32 ? 128 : 64;
             case 128:
+                return ncols == 32 ? 64 : 32;
             case 256:
-                if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
-                    return ncols <= 16 ? 64 : 32;
-                } else {
-                    return 64;
-                }
+                return 32;
             default:
                 GGML_ABORT("fatal error");
                 return -1;
@@ -49,24 +59,28 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
 
 static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
 #ifdef GGML_USE_HIP
+#ifdef RDNA
     switch (D) {
         case 64:
-            return 64;
+            return 128;
         case 128:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 32;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
         case 256:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 32;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 128 : 64;
         default:
             return -1;
     }
+#else
+    switch (D) {
+        case 64:
+            return ncols == 32 ? 128 : 64;
+        case 128:
+            return ncols == 32 ? 64 : 32;
+        case 256:
+            return 32;
+        default:
+            return -1;
+    }
+#endif // RDNA
 #else
 #ifdef FAST_FP16_AVAILABLE
     switch (D) {
@@ -100,17 +114,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
         case 64:
             return 64;
         case 128:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 128;
-#else
-            return 64;
-#endif // defined(GCN) || defined(CDNA)
         case 256:
-#if defined(GCN) || defined(CDNA)
-            return ncols <= 16 ? 64 : 128;
-#else
-            return ncols <= 16 ? 64 : 256;
-#endif // defined(GCN) || defined(CDNA)
+            return 128;
         default:
             return -1;
     }
@@ -120,9 +125,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
         case 64:
             return 64;
         case 128:
-            return ncols <= 16 ? 128 : 64;
         case 256:
-            return ncols <= 16 ? 64 : 128;
+            return 128;
         default:
             return -1;
     }
@@ -142,12 +146,27 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
     GGML_UNUSED_VARS(ncols, warp_size);
 }
 
-template<int D, int ncols, bool use_logit_softcap> // D == head size
-#ifdef GGML_USE_HIP
-__launch_bounds__(FATTN_TILE_NTHREADS, 1)
+static int fattn_tile_get_nthreads_host(const int cc, const int ncols) {
+    return 256;
+    GGML_UNUSED_VARS(cc, ncols);
+}
+
+static constexpr __device__ int fattn_tile_get_nthreads_device(int ncols) {
+    return 256;
+    GGML_UNUSED(ncols);
+}
+
+static constexpr __device__ int fattn_tile_get_occupancy_device(int ncols) {
+#ifdef RDNA
+    return 3;
 #else
-__launch_bounds__(FATTN_TILE_NTHREADS, 2)
-#endif // GGML_USE_HIP
+    return ncols <= 16 ? 3 : 2;
+#endif // RDNA
+    GGML_UNUSED(ncols);
+}
+
+template<int D, int ncols, bool use_logit_softcap> // D == head size
+__launch_bounds__(fattn_tile_get_nthreads_device(ncols), fattn_tile_get_occupancy_device(ncols))
 static __global__ void flash_attn_tile(
         const char * __restrict__ Q,
         const char * __restrict__ K,
@@ -193,7 +212,7 @@ static __global__ void flash_attn_tile(
     }
 
     constexpr int warp_size = 32;
-    constexpr int nwarps    = FATTN_TILE_NTHREADS / warp_size;
+    constexpr int nwarps    = fattn_tile_get_nthreads_device(ncols) / warp_size;
     constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
     static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
     constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
@@ -206,90 +225,126 @@ static __global__ void flash_attn_tile(
     const int sequence = blockIdx.z / ne02;
     const int head = blockIdx.z - sequence*ne02;
     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2   = (const float2 *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
-    const half2  * K_h2   = (const half2  *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
-    const half2  * V_h2   = (const half2  *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
-    const half   * maskh  = (const half   *) (mask  + nb33*(sequence % ne33)                          + nb31*ic0);
-    const float  * sinksf = (const float  *) (sinks);
+    const float * Q_f    = (const float *) (Q    + nb03* sequence         + nb02* head              + nb01*ic0);
+    const half2 * K_h2   = (const half2 *) (K    + nb13* sequence         + nb12*(head / gqa_ratio));
+    const half2 * V_h2   = (const half2 *) (V    + nb13* sequence         + nb12*(head / gqa_ratio)); // K and V have same shape
+    const half  * maskh  = (const half  *) (mask + nb33*(sequence % ne33)                           + nb31*ic0);
+    const float * sinksf = (const float *) (sinks);
 
     const int stride_KV2 = nb11 / sizeof(half2);
 
     const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
 
-#if defined(GGML_USE_HIP)
-    constexpr int cpy_nb = 16;
-#else
-    constexpr int cpy_nb = 8;
-#endif // defined(GGML_USE_HIP) && defined(GCN)
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
     constexpr int cpy_ne = cpy_nb / 4;
 
-    __shared__ float KQ[ncols][kq_stride];
+    constexpr int cpw = ncols/nwarps; // cols per warp
+
+    // softmax_iter_j == number of KQ columns for which to calculate softmax in parallel.
+    // KQ is originall 2D but uses a Z-shaped memory pattern for larger reads/writes.
 #ifdef FAST_FP16_AVAILABLE
+    constexpr int softmax_iter_j = cpw < 2*cpy_ne ? cpw : 2*cpy_ne;
+
+    __shared__ half  KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
     __shared__ half2 Q_tmp[ncols][D/2];
-    __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
-    half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
+    __shared__ half2 KV_tmp[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
+    half2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #else
+    constexpr int softmax_iter_j = cpw < 1*cpy_ne ? cpw : 1*cpy_ne;
+
+    __shared__ float KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j];
     __shared__ float Q_tmp[ncols][D];
-    __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
-    float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
-    float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
+    __shared__ float KV_tmp[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
+    float2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #endif // FAST_FP16_AVAILABLE
+    static_assert(cpw % softmax_iter_j == 0, "bad softmax_iter_j");
 
-
-    float kqmax[ncols/nwarps];
+    float KQ_max[cpw];
 #pragma unroll
     for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-        kqmax[j0/nwarps] = -FLT_MAX/2.0f;
+        KQ_max[j0/nwarps] = -FLT_MAX/2.0f;
     }
-    float kqsum[ncols/nwarps] = {0.0f};
+    float KQ_sum[cpw] = {0.0f};
+
+    // Load Q data, convert to FP16 if fast.
+#pragma unroll
+    for (int j0 = 0; j0 < cpw; ++j0) {
+        const int j = j0 + threadIdx.y*cpw;
+
+        constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
 
 #pragma unroll
-    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-        const int j = j0 + threadIdx.y;
+        for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
+            float tmp_f[cpy_ne_D] = {0.0f};
+            if (ic0 + j < ne01) {
+                ggml_cuda_memcpy_1<sizeof(tmp_f)>(tmp_f, &Q_f[j*(nb01/sizeof(float)) + i0 + threadIdx.x*cpy_ne_D]);
+            }
 
 #pragma unroll
-        for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-            const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f);
+            for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
+                tmp_f[i1] *= scale;
+            }
+
 #ifdef FAST_FP16_AVAILABLE
-            Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale);
+            half2 tmp_h2[cpy_ne_D/2];
+#pragma unroll
+            for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
+                tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
+            }
+            ggml_cuda_memcpy_1<sizeof(tmp_h2)>(&Q_tmp[j][i0/2 + threadIdx.x*(cpy_ne_D/2)], tmp_h2);
 #else
-            Q_tmp[j][2*i0             + threadIdx.x] = tmp.x * scale;
-            Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale;
+            ggml_cuda_memcpy_1<sizeof(tmp_f)> (&Q_tmp[j][i0   + threadIdx.x* cpy_ne_D],    tmp_f);
 #endif // FAST_FP16_AVAILABLE
         }
     }
 
     __syncthreads();
 
+    // Main loop over KV cache:
     const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
     for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) {
         // Calculate KQ tile and keep track of new maximum KQ values:
 
-        float kqmax_new[ncols/nwarps];
+        float KQ_max_new[cpw];
 #pragma unroll
-        for (int j = 0; j < ncols/nwarps; ++j) {
-            kqmax_new[j] = kqmax[j];
+        for (int j = 0; j < cpw; ++j) {
+            KQ_max_new[j] = KQ_max[j];
         }
 
-        float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}};
+        float KQ_acc[kq_stride/warp_size][cpw] = {{0.0f}}; // Accumulators for KQ matrix multiplication.
 
+        // KQ = K @ Q matrix multiplication:
 #pragma unroll
         for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) {
 #pragma unroll
             for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) {
                 const int i_KQ = i_KQ_0 + threadIdx.y;
 
-#pragma unroll
-                for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
-                    const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
 #ifdef FAST_FP16_AVAILABLE
-                    KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2;
-#else
-                    const float2 tmp_f2 = __half22float2(tmp_h2);
-                    KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1             + threadIdx.x] = tmp_f2.x;
-                    KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
-#endif // FAST_FP16_AVAILABLE
+                constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/(2*warp_size) ? cpy_ne : kq_nbatch/(2*warp_size);
+#pragma unroll
+                for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size*cpy_ne_kqnb) {
+                    ggml_cuda_memcpy_1<cpy_ne_kqnb*4>(
+                        &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb],
+                        &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x*cpy_ne_kqnb]);
                 }
+#else
+                constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/warp_size ? cpy_ne : kq_nbatch/warp_size;
+#pragma unroll
+                for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += warp_size*cpy_ne_kqnb) {
+                    half2 tmp_h2[cpy_ne_kqnb/2];
+                    ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
+                        tmp_h2, &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1/2 + threadIdx.x*(cpy_ne_kqnb/2)]);
+
+                    float2 tmp_f2[cpy_ne_kqnb/2];
+#pragma unroll
+                    for (int k_KQ_2 = 0; k_KQ_2 < cpy_ne_kqnb/2; ++k_KQ_2) {
+                        tmp_f2[k_KQ_2] = __half22float2(tmp_h2[k_KQ_2]);
+                    }
+                    ggml_cuda_memcpy_1<sizeof(tmp_f2)>(
+                        &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], tmp_f2);
+                }
+#endif // FAST_FP16_AVAILABLE
             }
 
             __syncthreads();
@@ -298,12 +353,12 @@ static __global__ void flash_attn_tile(
 #pragma unroll
             for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) {
                 half2 K_k[kq_stride/warp_size][cpy_ne];
-                half2 Q_k[ncols/nwarps][cpy_ne];
+                half2 Q_k[cpw][cpy_ne];
 #else
 #pragma unroll
             for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) {
                 float K_k[kq_stride/warp_size][cpy_ne];
-                float Q_k[ncols/nwarps][cpy_ne];
+                float Q_k[cpw][cpy_ne];
 #endif // FAST_FP16_AVAILABLE
 
 #pragma unroll
@@ -311,29 +366,29 @@ static __global__ void flash_attn_tile(
                     const int i_KQ = i_KQ_0 + threadIdx.x;
 
 #ifdef FAST_FP16_AVAILABLE
-                    ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
+                    ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
 #else
-                    ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch   + cpy_ne) + k_KQ_1]);
+                    ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch   + cpy_ne) + k_KQ_1]);
 #endif // FAST_FP16_AVAILABLE
                 }
 #pragma unroll
-                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
-                    const int j_KQ = j_KQ_0 + threadIdx.y;
+                for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
+                    const int j_KQ = j_KQ_0 + threadIdx.y*cpw;
 
 #ifdef FAST_FP16_AVAILABLE
-                    ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
+                    ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
 #else
-                    ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0   + k_KQ_1]);
+                    ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0   + k_KQ_1]);
 #endif // FAST_FP16_AVAILABLE
                 }
 
 #pragma unroll
                 for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
 #pragma unroll
-                    for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                    for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
 #pragma unroll
                         for (int k = 0; k < cpy_ne; ++k) {
-                            ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]);
+                            ggml_cuda_mad(KQ_acc[i_KQ_0/warp_size][j_KQ_0], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0][k]);
                         }
                     }
                 }
@@ -344,104 +399,77 @@ static __global__ void flash_attn_tile(
             }
         }
 
+        // Apply logit softcap, mask, update KQ_max:
 #pragma unroll
         for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
             const int i_KQ = i_KQ_0 + threadIdx.x;
 
 #pragma unroll
-            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
-                const int j_KQ = j_KQ_0 + threadIdx.y;
+            for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) {
+                const int j_KQ = j_KQ_0 + threadIdx.y*cpw;
 
                 if (use_logit_softcap) {
-                    sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
+                    KQ_acc[i_KQ_0/warp_size][j_KQ_0] = logit_softcap * tanhf(KQ_acc[i_KQ_0/warp_size][j_KQ_0]);
                 }
 
-                sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+                KQ_acc[i_KQ_0/warp_size][j_KQ_0] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
 
-                kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
-
-                KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps];
+                KQ_max_new[j_KQ_0] = fmaxf(KQ_max_new[j_KQ_0], KQ_acc[i_KQ_0/warp_size][j_KQ_0]);
             }
         }
 
         __syncthreads();
 
+        // Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators:
 #pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-            const int j = j0 + threadIdx.y;
-
-            kqmax_new[j0/nwarps] = warp_reduce_max<warp_size>(kqmax_new[j0/nwarps]);
-            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
-            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
-
-            float kqsum_add = 0.0f;
-            if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) {
-#pragma unroll
-                for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) {
-                    const int i = i0 + 4*threadIdx.x;
-
-                    float4 val = *(const float4 *) &KQ[j][i];
-                    val.x = expf(val.x - kqmax[j0/nwarps]);
-                    val.y = expf(val.y - kqmax[j0/nwarps]);
-                    val.z = expf(val.z - kqmax[j0/nwarps]);
-                    val.w = expf(val.w - kqmax[j0/nwarps]);
-                    kqsum_add += val.x + val.y + val.z + val.w;
-
+        for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
 #ifdef FAST_FP16_AVAILABLE
-                    const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)};
-                    ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
+            half  tmp[kq_stride/warp_size][softmax_iter_j];
 #else
-                    ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
+            float tmp[kq_stride/warp_size][softmax_iter_j];
 #endif // FAST_FP16_AVAILABLE
-                }
-            } else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) {
-#pragma unroll
-                for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) {
-                    const int i = i0 + 2*threadIdx.x;
 
-                    float2 val = *(const float2 *) &KQ[j][i];
-                    val.x = expf(val.x - kqmax[j0/nwarps]);
-                    val.y = expf(val.y - kqmax[j0/nwarps]);
-                    kqsum_add += val.x + val.y;
-#ifdef FAST_FP16_AVAILABLE
-                    const half2 tmp = make_half2(val.x, val.y);
-                    ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
-#else
-                    ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
-#endif // FAST_FP16_AVAILABLE
-                }
-            } else {
+#pragma unroll
+            for (int j1 = 0; j1 < softmax_iter_j; ++j1) {
+                KQ_max_new[j0+j1] = warp_reduce_max<warp_size>(KQ_max_new[j0+j1]);
+                const float KQ_max_scale = expf(KQ_max[j0+j1] - KQ_max_new[j0+j1]);
+                KQ_max[j0+j1] = KQ_max_new[j0+j1];
+
+                float KQ_sum_add = 0.0f;
+#pragma unroll
                 for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
-                    const int i = i0 + threadIdx.x;
-
-                    const float diff = KQ[j][i] - kqmax[j0/nwarps];
-                    const float val = expf(diff);
-                    kqsum_add += val;
-#ifdef FAST_FP16_AVAILABLE
-                    ((half *) KQ[j])[i] = val;
-#else
-                    KQ[j][i] = val;
-#endif // FAST_FP16_AVAILABLE
+                    const float val = expf(KQ_acc[i0/warp_size][j0+j1] - KQ_max[j0+j1]);
+                    KQ_sum_add += val;
+                    tmp[i0/warp_size][j1] = val;
                 }
-            }
-            kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
+                KQ_sum[j0+j1] = KQ_sum[j0+j1]*KQ_max_scale + KQ_sum_add;
 
 #ifdef FAST_FP16_AVAILABLE
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
+                const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
 #pragma unroll
-            for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
-            }
+                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
+                    VKQ[j0+j1][i0/warp_size] *= KQ_max_scale_h2;
+                }
 #else
 #pragma unroll
-            for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
-                VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
-            }
+                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
+                    VKQ[j0+j1][i0/warp_size].x *= KQ_max_scale;
+                    VKQ[j0+j1][i0/warp_size].y *= KQ_max_scale;
+                }
 #endif // FAST_FP16_AVAILABLE
+            }
+
+#pragma unroll
+            for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
+                const int i = i0 + threadIdx.x;
+
+                ggml_cuda_memcpy_1<sizeof(tmp[0])>(
+                    KQ[j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j)][i], tmp[i0/warp_size]);
+            }
         }
 
-        constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D;
+        // VKQ = V @ KQ matrix multiplication:
+        constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; // Number of V columns that fit in SRAM for K.
         static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter");
 #pragma unroll
         for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) {
@@ -449,65 +477,96 @@ static __global__ void flash_attn_tile(
             for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) {
                 const int k_tile = k1 + threadIdx.y;
 
-#pragma unroll
-                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                    const int i = i0 + threadIdx.x;
-
-                    const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i];
 #ifdef FAST_FP16_AVAILABLE
-                    KV_tmp_h2[k_tile*(D/2) + i] = tmp;
-#else
-                    KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp);
-#endif // FAST_FP16_AVAILABLE
+                constexpr int cpy_ne_D = cpy_ne < D/(2*warp_size) ? cpy_ne : D/(2*warp_size);
+#pragma unroll
+                for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
+                    ggml_cuda_memcpy_1<cpy_ne_D*4>(
+                        &KV_tmp[k_tile*(D/2) + i0 + threadIdx.x*cpy_ne_D],
+                        &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0 + threadIdx.x*cpy_ne_D]);
                 }
+#else
+                constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
+#pragma unroll
+                for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
+                    half2 tmp_h2[cpy_ne_D/2];
+                    ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
+                        tmp_h2, &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0/2 + threadIdx.x*(cpy_ne_D/2)]);
+
+                    float2 tmp_f2[cpy_ne_D/2];
+#pragma unroll
+                    for (int i1 = 0; i1 < cpy_ne_D/2; ++i1) {
+                        tmp_f2[i1] = __half22float2(tmp_h2[i1]);
+                    }
+                    ggml_cuda_memcpy_1<sizeof(tmp_f2)>(
+                        &KV_tmp[k_tile*D + i0 + threadIdx.x*cpy_ne_D], tmp_f2);
+                }
+#endif // FAST_FP16_AVAILABLE
             }
 
             __syncthreads();
 
+#ifdef FAST_FP16_AVAILABLE
 #pragma unroll
             for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
-#ifdef FAST_FP16_AVAILABLE
                 half2 V_k[(D/2)/warp_size];
-                half2 KQ_k[ncols/nwarps];
-#else
-                float2 V_k[(D/2)/warp_size];
-                float  KQ_k[ncols/nwarps];
-#endif // FAST_FP16_AVAILABLE
+                half2 KQ_k[cpw];
 
+                constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size;
 #pragma unroll
-                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                    const int i = i0 + threadIdx.x;
-
-#ifdef FAST_FP16_AVAILABLE
-                    V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i];
-#else
-                    V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i];
-#endif // FAST_FP16_AVAILABLE
+                for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
+                    ggml_cuda_memcpy_1<cpy_ne_D*4>(&V_k[i0/warp_size], &KV_tmp[k1*(D/2) + i0 + threadIdx.x*cpy_ne_D]);
                 }
 #pragma unroll
-                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-                    const int j = j0 + threadIdx.y;
+                for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
+                    const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j);
 
-#ifdef FAST_FP16_AVAILABLE
-                    KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]);
-#else
-                    KQ_k[j0/nwarps] = KQ[j][k0 + k1];
-#endif // FAST_FP16_AVAILABLE
+                    half tmp[softmax_iter_j];
+                    ggml_cuda_memcpy_1<softmax_iter_j*sizeof(half)>(
+                        &tmp, KQ[j][k0 + k1]);
+#pragma unroll
+                    for (int j1 = 0; j1 < softmax_iter_j; ++j1) {
+                        KQ_k[j0+j1] = __half2half2(tmp[j1]);
+                    }
                 }
 
 #pragma unroll
                 for (int i0 = 0; i0 < D/2; i0 += warp_size) {
 #pragma unroll
-                    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-#ifdef FAST_FP16_AVAILABLE
-                        VKQ[j0/nwarps][i0/warp_size]   += V_k[i0/warp_size]  *KQ_k[j0/nwarps];
-#else
-                        VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps];
-                        VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps];
-#endif // FAST_FP16_AVAILABLE
+                    for (int j0 = 0; j0 < cpw; ++j0) {
+                        VKQ[j0][i0/warp_size] += V_k[i0/warp_size]*KQ_k[j0];
                     }
                 }
             }
+#else
+#pragma unroll
+            for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
+                float2 V_k[(D/2)/warp_size];
+                float  KQ_k[cpw];
+
+                constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
+#pragma unroll
+                for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
+                    ggml_cuda_memcpy_1<cpy_ne_D*4>(&V_k[i0/(2*warp_size)], &KV_tmp[k1*D + i0 + threadIdx.x*cpy_ne_D]);
+                }
+#pragma unroll
+                for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) {
+                    const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j);
+
+                    ggml_cuda_memcpy_1<softmax_iter_j*sizeof(float)>(
+                        &KQ_k[j0], KQ[j][k0 + k1]);
+                }
+
+#pragma unroll
+                for (int i0 = 0; i0 < D/2; i0 += warp_size) {
+#pragma unroll
+                    for (int j0 = 0; j0 < cpw; ++j0) {
+                        VKQ[j0][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0];
+                        VKQ[j0][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0];
+                    }
+                }
+            }
+#endif // FAST_FP16_AVAILABLE
 
             __syncthreads();
         }
@@ -519,69 +578,92 @@ static __global__ void flash_attn_tile(
         const float sink = sinksf[head];
 
 #pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
-            float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
-            kqmax_new_j = warp_reduce_max<warp_size>(kqmax_new_j);
+        for (int j0 = 0; j0 < cpw; ++j0) {
+            float KQ_max_new_j = fmaxf(KQ_max[j0], sink);
+            KQ_max_new_j = warp_reduce_max<warp_size>(KQ_max_new_j);
 
-            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
-            kqmax[j0/nwarps] = kqmax_new_j;
+            const float KQ_max_scale = expf(KQ_max[j0] - KQ_max_new_j);
+            KQ_max[j0] = KQ_max_new_j;
 
-            const float val = expf(sink - kqmax[j0/nwarps]);
-            kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
+            const float val = expf(sink - KQ_max[j0]);
+            KQ_sum[j0] = KQ_sum[j0] * KQ_max_scale;
             if (threadIdx.x == 0) {
-                kqsum[j0/nwarps] += val;
+                KQ_sum[j0] += val;
             }
 
 #ifdef FAST_FP16_AVAILABLE
             const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
 #pragma unroll
             for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
+                VKQ[j0][i0/warp_size] *= KQ_max_scale_h2;
             }
 #else
 #pragma unroll
             for (int i0 = 0; i0 < D/2; i0 += warp_size) {
-                VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
-                VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
+                VKQ[j0][i0/warp_size].x *= KQ_max_scale;
+                VKQ[j0][i0/warp_size].y *= KQ_max_scale;
             }
 #endif // FAST_FP16_AVAILABLE
         }
     }
 
-    float2 * dst2 = (float2 *) dst;
-
 #pragma unroll
-    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
-        const int j_VKQ = j_VKQ_0 + threadIdx.y;
+    for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
+        KQ_sum[j_VKQ_0] = warp_reduce_sum<warp_size>(KQ_sum[j_VKQ_0]);
+    }
+    if (gridDim.y == 1) {
+#pragma unroll
+        for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
+#ifdef FAST_FP16_AVAILABLE
+            const half2 KQ_sum_j_inv = make_half2(1.0f/KQ_sum[j_VKQ_0], 1.0f/KQ_sum[j_VKQ_0]);
+#pragma unroll
+            for (int i = 0; i < (D/2)/warp_size; ++i) {
+                VKQ[j_VKQ_0][i] *= KQ_sum_j_inv;
+            }
+#else
+            const float KQ_sum_j_inv = 1.0f/KQ_sum[j_VKQ_0];
+#pragma unroll
+            for (int i = 0; i < (D/2)/warp_size; ++i) {
+                VKQ[j_VKQ_0][i].x *= KQ_sum_j_inv;
+                VKQ[j_VKQ_0][i].y *= KQ_sum_j_inv;
+            }
+#endif // FAST_FP16_AVAILABLE
+        }
+    }
+
+    // Write back results:
+#pragma unroll
+    for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) {
+        const int j_VKQ = j_VKQ_0 + threadIdx.y*cpw;
 
         if (ic0 + j_VKQ >= ne01) {
             return;
         }
 
-        float kqsum_j = kqsum[j_VKQ_0/nwarps];
-        kqsum_j = warp_reduce_sum<warp_size>(kqsum_j);
-
         const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
 
-#pragma unroll
-        for (int i00 = 0; i00 < D/2; i00 += warp_size) {
-            const int i0 = i00 + threadIdx.x;
-
 #ifdef FAST_FP16_AVAILABLE
-            float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]);
+        constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size;
+#pragma unroll
+        for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) {
+            float2 tmp[cpy_ne_D];
+#pragma unroll
+            for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
+                tmp[i1] = __half22float2(VKQ[j_VKQ_0][i0/warp_size + i1]);
+            }
+            ggml_cuda_memcpy_1<sizeof(tmp)>(&dst[j_dst_unrolled*D + 2*i0 + threadIdx.x*(2*cpy_ne_D)], tmp);
+        }
 #else
-            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size];
+        constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size;
+#pragma unroll
+        for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) {
+            ggml_cuda_memcpy_1<cpy_ne_D*4>(
+                &dst[j_dst_unrolled*D + i0 + threadIdx.x*cpy_ne_D], &VKQ[j_VKQ_0][i0/(2*warp_size)]);
+        }
 #endif // FAST_FP16_AVAILABLE
 
-            if (gridDim.y == 1) {
-                dst_val.x /= kqsum_j;
-                dst_val.y /= kqsum_j;
-            }
-            dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
-        }
-
         if (gridDim.y != 1 && threadIdx.x == 0) {
-            dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+            dst_meta[j_dst_unrolled] = make_float2(KQ_max[j_VKQ_0], KQ_sum[j_VKQ_0]);
         }
     }
 #else
@@ -602,15 +684,29 @@ template <int D, bool use_logit_softcap>
 static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * Q = dst->src[0];
 
-    const int id                 = ggml_cuda_get_device();
-    const int cc                 = ggml_cuda_info().devices[id].cc;
-    const int warp_size          = 32;
-    const int nwarps             = FATTN_TILE_NTHREADS / warp_size;
+    const int id        = ggml_cuda_get_device();
+    const int cc        = ggml_cuda_info().devices[id].cc;
+    const int warp_size = 32;
 
     constexpr size_t nbytes_shared = 0;
 
+#ifdef GGML_USE_HIP
+    if constexpr (D <= 128) {
+        if (Q->ne[1] > 32) {
+            constexpr int cols_per_block = 64;
+            const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
+            fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
+            const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
+            launch_fattn<D, cols_per_block, 1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
+            return;
+        }
+    }
+#endif // GGML_USE_HIP
+
     if (Q->ne[1] > 16) {
         constexpr int cols_per_block = 32;
+        const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
         fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
         const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
         launch_fattn<D, cols_per_block, 1>
@@ -619,6 +715,7 @@ static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml
     }
 
     constexpr int cols_per_block = 16;
+    const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size;
     fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
     const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
     launch_fattn<D, cols_per_block, 1>
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 12bbee4556..37386afcd4 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -158,41 +158,41 @@
 
 #define __CUDA_ARCH__ 1300
 
-#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
-#define GCN
-#endif
-
 #if defined(__gfx900__) || defined(__gfx906__)
 #define GCN5
-#endif
+#endif // defined(__gfx900__) || defined(__gfx906__)
 
 #if defined(__gfx803__)
 #define GCN4
-#endif
+#endif // defined(__gfx803__)
 
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
-#define CDNA // For the entire family
-#endif
+#if defined(GCN5) || defined(GCN4)
+#define GCN
+#endif // defined(GCN5) || defined(GCN4)
 
 #if defined(__gfx942__)
 #define CDNA3
-#endif
+#endif // defined(__gfx942__)
 
 #if defined(__gfx90a__)
 #define CDNA2
-#endif
+#endif // defined(__gfx90a__)
 
 #if defined(__gfx908__)
 #define CDNA1
-#endif
+#endif // defined(__gfx908__)
+
+#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
+#define CDNA // For the entire family
+#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
 
 #if defined(__GFX12__)
 #define RDNA4
-#endif
+#endif // defined(__GFX12__)
 
 #if defined(__GFX11__)
 #define RDNA3
-#endif
+#endif // defined(__GFX11__)
 
 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
     defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
@@ -201,7 +201,11 @@
 
 #if defined(__gfx1010__) || defined(__gfx1012__)
 #define RDNA1
-#endif
+#endif // defined(__gfx1010__) || defined(__gfx1012__)
+
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
+#define RDNA // For the entire family
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
 
 #ifndef __has_builtin
     #define __has_builtin(x) 0