From 10d197409bd9537ff302ad09966fe406882fef9d Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 15 Sep 2025 14:38:42 -0700 Subject: [PATCH 01/19] releases : switch to rocWMMA develop branch, add gfx1151 (#15992) * releases : switch to rocWMMA develop branch, add gfx1151 * remove unused variable ROCM_VERSION --- .github/workflows/release.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e959209ab2..f461456edf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -530,15 +530,13 @@ jobs: runs-on: windows-2022 env: - # The ROCm version must correspond to the version used in the HIP SDK. - ROCM_VERSION: "6.4.2" HIPSDK_INSTALLER_VERSION: "25.Q3" strategy: matrix: include: - name: "radeon" - gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -548,7 +546,7 @@ jobs: - name: Clone rocWMMA repository id: clone_rocwmma run: | - git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1 + git clone https://github.com/rocm/rocwmma --branch develop --depth 1 - name: Cache ROCm Installation id: cache-rocm From dc381aa9a6dc45f00673471d34b8bddd30e77570 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 15 Sep 2025 14:38:52 -0700 Subject: [PATCH 02/19] docker : enable rocWMMA in ROCm images, add gfx1151 (#15997) --- .devops/rocm.Dockerfile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 221077197d..106c62b4dc 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -17,14 +17,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # gfx906 is deprecated #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html -ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201' -#ARG ROCM_DOCKER_ARCH=gfx1100 +ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151' +#ARG ROCM_DOCKER_ARCH='gfx1151' -# Set ROCm architectured +# Set ROCm architectures ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -# ENV CC=/opt/rocm/llvm/bin/clang -# ENV CXX=/opt/rocm/llvm/bin/clang++ RUN apt-get update \ && apt-get install -y \ @@ -39,8 +36,16 @@ WORKDIR /app COPY . . +RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1 + RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ + cmake -S . -B build \ + -DGGML_HIP=ON \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ + -DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \ + -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \ + -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \ + -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ && cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib \ From 3d4053f77f0f78ee2b791088c02af653ebee42dd Mon Sep 17 00:00:00 2001 From: Jake Karnes Date: Mon, 15 Sep 2025 16:28:31 -0600 Subject: [PATCH 03/19] CUDA: fix im2col_3d to respect non-contiguous inputs (views) (#15956) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix im2col_3d to respect non-contiguous inputs (views) The CUDA 3D im2col kernel computed source addresses assuming compact layout (products of dims), ignoring nb[] strides. This patch switches im2col_3d source indexing to use true strides derived from src1->nb[] (in elements), mirroring the approach used in the 2D CUDA im2col path. Destination indexing is unchanged. * use ggml_element_size() for src strides Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/im2col.cu | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 7737d6a5d5..56dc054574 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -122,11 +122,14 @@ static __global__ void im2col_3d_kernel( int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW, int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW, int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) { const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= IC_KD_KH_KW) { return; } + GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH); + GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW); const int64_t iic = i / KD_KH_KW; const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW; @@ -148,7 +151,7 @@ static __global__ void im2col_3d_kernel( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) { dst[offset_dst] = 0.0f; } else { - const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw; + const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x); dst[offset_dst] = src[offset_src]; } } @@ -159,6 +162,7 @@ template static void im2col_3d_cuda(const float * src, T* dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { const int64_t OH_OW = OH*OW; const int64_t KD_KH_KW = KD*KH*KW; @@ -179,23 +183,30 @@ static void im2col_3d_cuda(const float * src, T* dst, OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW, IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW, OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH, + stride_q, stride_z, stride_y, stride_x, s0, s1, s2, p0, p1, p2, d0, d1, d2); } static void im2col_3d_cuda_f16(const float * src, half * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } static void im2col_3d_cuda_f32(const float * src, float * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -235,9 +246,19 @@ void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int64_t OH = ne2; const int64_t OW = ne1; + const size_t es = ggml_element_size(src1); + const int64_t stride_x = src1->nb[0] / es; + const int64_t stride_y = src1->nb[1] / es; + const int64_t stride_z = src1->nb[2] / es; + const int64_t stride_q = src1->nb[3] / es; + if(dst->type == GGML_TYPE_F16) { - im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } else { - im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } } From 6d758839ff741d4966ca92b7f801b7a8b5b96364 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 16 Sep 2025 10:38:28 +0800 Subject: [PATCH 04/19] Add LLaDA-7b-MoE diffusion model (#16003) --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 73 +++++++++++ convert_hf_to_gguf_update.py | 1 + examples/diffusion/diffusion-cli.cpp | 24 ++-- gguf-py/gguf/constants.py | 19 +++ src/llama-arch.cpp | 22 ++++ src/llama-arch.h | 1 + src/llama-model.cpp | 179 +++++++++++++++++++++++++++ src/llama-vocab.cpp | 3 +- 9 files changed, 315 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c15008fe79..19189d8f9f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1704,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 855789f1ba..5a21ba2110 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -888,6 +888,9 @@ class TextModel(ModelBase): if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": # ref: https://huggingface.co/JetBrains/Mellum-4b-base res = "mellum" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base + res = "llada-moe" if res is None: logger.warning("\n") @@ -8239,6 +8242,76 @@ class HunYuanMoEModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index eb8fdfa7e1..21bb4a9f3e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -139,6 +139,7 @@ models = [ {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, + {"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index abf7fb3573..273942a165 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx, n_generated = params.max_length; } -static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { +static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) { if (!use_chat_template) { return prompt; } auto chat_templates = common_chat_templates_init(model, ""); - common_chat_templates_inputs inputs; - common_chat_msg user_msg; - user_msg.role = "user"; - user_msg.content = prompt; - inputs.add_generation_prompt = true; + common_chat_msg system_msg; + + if (!system_prompt.empty()) { + system_msg.role = "system"; + system_msg.content = system_prompt; + inputs.messages.push_back(system_msg); + } + + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = prompt; + inputs.messages.push_back(user_msg); + inputs.add_generation_prompt = true; auto result = common_chat_templates_apply(chat_templates.get(), inputs); @@ -579,7 +587,8 @@ int main(int argc, char ** argv) { llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); const llama_vocab * vocab = llama_model_get_vocab(model); - std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); + + std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model); std::vector input_tokens = common_tokenize(vocab, formatted_prompt, @@ -596,6 +605,7 @@ int main(int argc, char ** argv) { } llama_token mask_token_id = llama_vocab_mask(vocab); + GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); bool visual_mode = params.diffusion.visual_mode; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c7edef919b..7e16cbcbde 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -399,6 +399,7 @@ class MODEL_ARCH(IntEnum): DREAM = auto() SMALLTHINKER = auto() LLADA = auto() + LLADA_MOE = auto() SEED_OSS = auto() @@ -735,6 +736,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", + MODEL_ARCH.LLADA_MOE: "llada-moe", MODEL_ARCH.SEED_OSS: "seed_oss", } @@ -2693,6 +2695,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLADA_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], # TODO } diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 3122331d8e..a4d2973ada 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -96,6 +96,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, + { LLM_ARCH_LLADA_MOE, "llada-moe" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2147,6 +2148,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_SEED_OSS, { @@ -2427,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index a4ac28b525..d181ce6784 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -100,6 +100,7 @@ enum llm_arch { LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, + LLM_ARCH_LLADA_MOE, LLM_ARCH_SEED_OSS, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4864ed8e72..731e87383b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -936,6 +936,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA_MOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention + hparams.causal_attn = false; + switch (hparams.n_layer) { + case 16: type = LLM_TYPE_A1_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -2387,6 +2399,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_LLADA_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe"); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + } + } break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -12444,6 +12490,132 @@ struct llm_build_olmoe : public llm_graph_context { } }; +struct llm_build_llada_moe : public llm_graph_context { + llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_openelm : public llm_graph_context { llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -18636,6 +18808,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA] case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: { res = nullptr; } break; @@ -18841,6 +19014,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_LLADA_MOE: + { + llm = std::make_unique(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique(*this, params); @@ -19307,6 +19485,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3MOE: + case LLM_ARCH_LLADA_MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b551253afb..8cb36661a0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1962,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; clean_spaces = false; } else if ( - tokenizer_pre == "bailingmoe") { + tokenizer_pre == "bailingmoe" || + tokenizer_pre == "llada-moe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; } else if ( From 07808ebb07e2b1aa19032705e332679ddf967614 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Mon, 15 Sep 2025 22:54:44 -0400 Subject: [PATCH 05/19] cmake : Do not install tools on iOS targets (#15903) --- CMakeLists.txt | 7 +++++++ tools/batched-bench/CMakeLists.txt | 5 ++++- tools/cvector-generator/CMakeLists.txt | 5 ++++- tools/export-lora/CMakeLists.txt | 5 ++++- tools/gguf-split/CMakeLists.txt | 5 ++++- tools/imatrix/CMakeLists.txt | 5 ++++- tools/llama-bench/CMakeLists.txt | 5 ++++- tools/main/CMakeLists.txt | 5 ++++- tools/mtmd/CMakeLists.txt | 2 +- tools/perplexity/CMakeLists.txt | 5 ++++- tools/quantize/CMakeLists.txt | 5 ++++- tools/run/CMakeLists.txt | 4 +++- tools/tokenize/CMakeLists.txt | 4 +++- tools/tts/CMakeLists.txt | 5 ++++- 14 files changed, 54 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36a2078e4c..4720e1f1a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,12 @@ if (MSVC) add_compile_options("$<$:/bigobj>") endif() +if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + set(LLAMA_TOOLS_INSTALL_DEFAULT OFF) +else() + set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE}) +endif() + # # option list # @@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) # 3rd party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 68ad707f32..4a46b57a52 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt index 49ad9561c8..baeb4d00c1 100644 --- a/tools/cvector-generator/CMakeLists.txt +++ b/tools/cvector-generator/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt index 310455787a..cddfa77f02 100644 --- a/tools/export-lora/CMakeLists.txt +++ b/tools/export-lora/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt index c407e2f0af..9b2125087c 100644 --- a/tools/gguf-split/CMakeLists.txt +++ b/tools/gguf-split/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47c..22f2fe5fdb 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 17e3b9b87b..b8543a9692 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt index af3d9150f8..8f8e9d444c 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/main/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 0979488560..2381012a0d 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -55,7 +55,7 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable (${TARGET} mtmd-cli.cpp) set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS") +if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) endif() target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index 3e68640933..12b28b2be4 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 47e5cbe30c..bd9ddbd67d 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,6 +1,9 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt index d018959698..e52294ccc0 100644 --- a/tools/run/CMakeLists.txt +++ b/tools/run/CMakeLists.txt @@ -10,6 +10,8 @@ if (LLAMA_CURL) set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES}) endif () -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt index 1690b53e5d..feed9a1062 100644 --- a/tools/tokenize/CMakeLists.txt +++ b/tools/tokenize/CMakeLists.txt @@ -1,5 +1,7 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index c72bd814c3..76320d4c2d 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-tts) add_executable(${TARGET} tts.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() From 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 05:57:16 +0200 Subject: [PATCH 06/19] ci : update macos-latest* jobs to use macos-latest (#15938) * ci : update macos-latest* jobs to use macos-latest This commit updates the jobs that are named macos-latest* to use the macos-latest label instead explicit versions. The motivation for this is that there is currently a mixuture of versions in this workflow and there are jobs that are failing because they require a newer version. Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17644792595/job/50140010907#step:5:1759 * ci : add xcodebuild -downloadPlatform iOS command --- .github/workflows/build.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1bd35627da..9a668bf8dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -56,7 +56,7 @@ env: jobs: macOS-latest-cmake-arm64: - runs-on: macos-14 + runs-on: macos-latest steps: - name: Clone @@ -97,7 +97,7 @@ jobs: ctest -L 'main|curl' --verbose --timeout 900 macOS-latest-cmake-x64: - runs-on: macos-13 + runs-on: macos-latest steps: - name: Clone @@ -138,7 +138,7 @@ jobs: ctest -L main --verbose --timeout 900 macOS-latest-cmake-arm64-webgpu: - runs-on: macos-14 + runs-on: latest steps: - name: Clone @@ -1171,7 +1171,9 @@ jobs: ./build-xcframework.sh - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + run: | + xcodebuild -downloadPlatform iOS + xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build android-build: runs-on: ubuntu-latest From f1fbffb5c0b34b2a68febb7da3fd0f8333f1ed4c Mon Sep 17 00:00:00 2001 From: Bowen Han Date: Mon, 15 Sep 2025 23:59:19 -0700 Subject: [PATCH 07/19] fix: apply clang-format to CUDA macros (#16017) clang-format previously broke long CUDA macros (e.g. __launch_bounds__) into unreadable line breaks inside template declarations, such as: template __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1) This change adjusts formatting rules so that CUDA macros remain consistent and aligned with the surrounding template syntax. --- .clang-format | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.clang-format b/.clang-format index 117e6986f6..742723fc8f 100644 --- a/.clang-format +++ b/.clang-format @@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true +# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them +AttributeMacros: + - __host__ + - __device__ + - __global__ + - __forceinline__ + - __launch_bounds__ BinPackArguments: true BinPackParameters: false # OnePerLine BitFieldColonSpacing: Both From 76888d202ed2b835ae19ea9f9db6baf39e419297 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 13:41:38 +0200 Subject: [PATCH 08/19] ci : upload xcframework artifact from ios-xcode-build job (#16010) This commit updates the github workflows build.yml file to include steps for uploading and downloading the xcframework artifact. The macos-latest-swift job now depends on the ios-xcode-build job and downloads the xcframework artifact produced by it. The motivation for this changes is that it takes a long time to build the xcframework and we are currently doing this twice in the workflow. With this change, we only build it once and reuse the artifact. --- .github/workflows/build.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9a668bf8dd..7f2dccde0c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -711,6 +711,7 @@ jobs: macOS-latest-swift: runs-on: macos-latest + needs: ios-xcode-build strategy: matrix: @@ -727,6 +728,12 @@ jobs: key: macOS-latest-swift evict-old-files: 1d + - name: Download xcframework artifact + uses: actions/download-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + - name: Dependencies id: depends continue-on-error: true @@ -748,11 +755,6 @@ jobs: -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - windows-msys2: runs-on: windows-2025 @@ -1170,6 +1172,13 @@ jobs: run: | ./build-xcframework.sh + - name: Upload xcframework artifact + uses: actions/upload-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + retention-days: 1 + - name: Build Xcode project run: | xcodebuild -downloadPlatform iOS From 3913f8730ec6d6245480affc30ae3049107956f4 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 15:25:57 +0200 Subject: [PATCH 09/19] ggml : fix padding in timestep embedding kernels (#15932) * ggml : remove adding extra dim timestep embedding This commit updates the ggml_timestep_embedding function to no longer add an extra dimension when the specified dimension is odd. The motivation for this change is that this introduces an unnecessary dimension when the dimension is odd, which caused an issue in the kernels which were not expecting this extra dimension and it resulted in uninitialized memory for the second to last dimension. * ggml-cuda : fix padding in timestep embedding kernel This commit removes the zeroing out of the last dimension now that we are not adding the extra padding dimension. * ggml-metal : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel * ggml-opencl : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-sycl : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-vulkan : fix padding in timestep embedding kernel This commit fixes the zero padding for odd dimensions in the timestep embedding kernel. * ggml-cpu : fix padding in timestep embedding function This commit removes the zeroing out of the last dimension now that we are not adding the extra padding dimension. --- ggml/src/ggml-cpu/ops.cpp | 1 - ggml/src/ggml-cuda/tsembd.cu | 6 +++--- ggml/src/ggml-metal/ggml-metal.metal | 2 +- ggml/src/ggml-opencl/kernels/tsembd.cl | 4 ++-- ggml/src/ggml-sycl/tsembd.cpp | 7 ++++--- .../src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp | 7 ++++--- ggml/src/ggml.c | 6 +----- 7 files changed, 15 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 212e52ef6a..c4824d145a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32( } if (dim % 2 != 0 && ith == 0) { embed_data[2 * half] = 0.f; - embed_data[dim] = 0.f; } } } diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu index 153ddbcda9..b91a26fc80 100644 --- a/ggml/src/ggml-cuda/tsembd.cu +++ b/ggml/src/ggml-cuda/tsembd.cu @@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d int j = threadIdx.x + blockIdx.x * blockDim.x; float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4314c9cc93..5057e264f6 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4167,7 +4167,7 @@ kernel void kernel_timestep_embedding_f32( } if (args.dim % 2 != 0 && tpitg.x == 0) { - embed_data[args.dim] = 0.f; + embed_data[2 * half_] = 0.f; } } diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl index 4b1107f70b..21444bd958 100644 --- a/ggml/src/ggml-opencl/kernels/tsembd.cl +++ b/ggml/src/ggml-opencl/kernels/tsembd.cl @@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding( local_half_dim = logical_dim / 2; local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); - if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { - local_embed_data_ptr[logical_dim] = 0.0f; + if (logical_dim % 2 != 0 && local_j == local_half_dim) { + local_embed_data_ptr[2 * local_half_dim] = 0.0f; } if (local_j >= local_half_dim) { diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index f6ca626ea7..f2003794d3 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -21,11 +21,12 @@ static void timestep_embedding_f32( int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; + + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp index 79e065a931..ce8e09442d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp @@ -24,11 +24,12 @@ void main() { const uint j = gl_GlobalInvocationID.x; const uint d_offset = i * p.nb1; - if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { - data_d[d_offset + p.dim] = 0.f; + const uint half_dim = p.dim / 2; + + if (p.dim % 2 != 0 && j == half_dim) { + data_d[d_offset + 2 * half_dim] = 0.f; } - const uint half_dim = p.dim / 2; if (j >= half_dim) { return; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 50dc1aa24f..3584827dca 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4923,12 +4923,8 @@ struct ggml_tensor * ggml_timestep_embedding( struct ggml_tensor * timesteps, int dim, int max_period) { - int actual_dim = dim; - if (dim % 2 != 0) { - actual_dim = dim + 1; - } - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]); ggml_set_op_params_i32(result, 0, dim); ggml_set_op_params_i32(result, 1, max_period); From 77475530b8bbea3bf578632507e1284cdfe2c8c0 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Sep 2025 15:27:52 +0200 Subject: [PATCH 10/19] ci : use macos-latest for arm64 webgpu build (#16029) This commit updates the runs-on field for the macOS arm64 webgpu build job to use macos-latest instead of just latest. The motivation for this is that this job can wait for a runner to pick up the job for a very long time, sometimes over 7 hours. This is an attempt to see if this change can help reduce the wait time. Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17754163447/job/50454257570?pr=16004 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7f2dccde0c..0a00c85b73 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -138,7 +138,7 @@ jobs: ctest -L main --verbose --timeout 900 macOS-latest-cmake-arm64-webgpu: - runs-on: latest + runs-on: macos-latest steps: - name: Clone From 8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68 Mon Sep 17 00:00:00 2001 From: jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:17:08 +0200 Subject: [PATCH 11/19] llama-bench: add --n-cpu-moe support (#15952) * llama-bench: add --n-cpu-moe support Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. --- common/arg.cpp | 8 +-- common/common.h | 14 +++++ tools/llama-bench/llama-bench.cpp | 87 +++++++++++++++++++++++++------ 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 19189d8f9f..9fd8858e82 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", [](common_params & params) { - params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_env("LLAMA_ARG_CPU_MOE")); add_opt(common_arg( @@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex for (int i = 0; i < value; ++i) { // keep strings alive and avoid leaking memory by storing them in a static vector static std::list buft_overrides; - buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides.push_back(llm_ffn_exps_block_regex(i)); params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()}); } } @@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe-draft", "-cmoed"}, "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", [](common_params & params) { - params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( @@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (int i = 0; i < value; ++i) { static std::list buft_overrides_draft; - buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); } } diff --git a/common/common.h b/common/common.h index 5063d73f96..83d44dbaa7 100644 --- a/common/common.h +++ b/common/common.h @@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } +// +// MoE utils +// + +const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps"; + +static std::string llm_ffn_exps_block_regex(int idx) { + return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX); +} + +static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { + return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() }; +} + // // training utils // diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 95f662a297..ad47bf144f 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -250,6 +250,7 @@ struct cmd_params { std::vector cpu_strict; std::vector poll; std::vector n_gpu_layers; + std::vector n_cpu_moe; std::vector rpc_servers; std::vector split_mode; std::vector main_gpu; @@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = { /* cpu_strict */ { false }, /* poll */ { 50 }, /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, /* rpc_servers */ { "" }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, @@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); if (llama_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); @@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = parse_int_range(argv[i]); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; @@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } @@ -901,6 +915,7 @@ struct cmd_params_instance { bool cpu_strict; int poll; int n_gpu_layers; + int n_cpu_moe; std::string rpc_servers_str; llama_split_mode split_mode; int main_gpu; @@ -973,20 +988,50 @@ struct cmd_params_instance { mparams.tensor_split = tensor_split.data(); mparams.use_mmap = use_mmap; - if (tensor_buft_overrides.empty()) { - mparams.tensor_buft_overrides = nullptr; + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + } else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } } else { - GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); - mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); } return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && - split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && - tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && + main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } llama_context_params to_llama_cparams() const { @@ -1014,6 +1059,7 @@ static std::vector get_cmd_params_instances(const cmd_param // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) for (const auto & rpc : params.rpc_servers) for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) @@ -1051,6 +1097,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1083,6 +1130,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1115,6 +1163,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1152,6 +1201,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + int n_cpu_moe; llama_split_mode split_mode; int main_gpu; bool no_kv_offload; @@ -1186,6 +1236,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; @@ -1236,12 +1287,14 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", + "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", + "stddev_ts" }; return fields; } @@ -1251,8 +1304,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1320,6 +1373,7 @@ struct test { ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), @@ -1568,6 +1622,9 @@ struct markdown_printer : public printer { if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } From d5fabe3682de515fd09d6c981f7a0d1b75614455 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 17 Sep 2025 14:33:08 +0800 Subject: [PATCH 12/19] CANN: Optimize ggml_cann_set_device (#15935) * CANN: Fix ggml_cann_set_device to avoid redundant device switches - Added a check to skip aclrtSetDevice if the current device is already set. - Prevents unnecessary context switches while keeping thread/device consistency. * CANN: add device default id --- ggml/src/ggml-cann/common.h | 5 ++++- ggml/src/ggml-cann/ggml-cann.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index c5fce8dc91..b707b84359 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -526,7 +526,10 @@ struct ggml_backend_cann_context { */ aclrtStream stream(int stream) { if (streams[stream] == nullptr) { - ggml_cann_set_device(device); + // If the device is not set here, destroying the stream later may cause a mismatch + // between the thread contexts where the stream was created and destroyed. + // However, I printed the device_id, thread_id, and stream, and they are all consistent. + ACL_CHECK(aclrtSetDevice(device)); ACL_CHECK(aclrtCreateStream(&streams[stream])); } return streams[stream]; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 19a18a281d..56d82b4af3 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -75,13 +75,12 @@ * @param device The device ID to set. */ void ggml_cann_set_device(const int32_t device) { - // TODO: uncomment these lines after empty context has fixed. - // int current_device; - // ACL_CHECK(aclrtGetDevice(¤t_device)); + int current_device = -1; + aclrtGetDevice(¤t_device); - // if (device == current_device) { - // return; - // } + if (device == current_device) { + return; + } ACL_CHECK(aclrtSetDevice(device)); } @@ -1729,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, ggml_cann_get_rows(ctx, dst); break; case GGML_OP_SET_ROWS: + std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl; ggml_cann_set_rows(ctx, dst); break; case GGML_OP_DUP: From 85286f354813056f6c835046c0acfa3bf6ba9432 Mon Sep 17 00:00:00 2001 From: Shane A Date: Wed, 17 Sep 2025 00:01:58 -0700 Subject: [PATCH 13/19] model : add OLMo3 support (#16015) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add HF to gguf conversion logic for Olmo3 * Add Olmo3 implementation * Update rope comment * Fix indentation Co-authored-by: Sigbjørn Skjæret * Apply suggestion from @CISC Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++++ src/llama-model.cpp | 47 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5a21ba2110..ce83f24695 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6009,9 +6009,34 @@ class SeedOssModel(TextModel): @ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th. + for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + @ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 731e87383b..2be807a6a9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1350,6 +1350,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (found_swa && hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.set_swa_pattern(4); + } else { + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + } + switch (hparams.n_layer) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; @@ -12233,6 +12241,7 @@ struct llm_build_olmo : public llm_graph_context { } }; +template struct llm_build_olmo2 : public llm_graph_context { llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -12248,7 +12257,14 @@ struct llm_build_olmo2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_iswa(); + } else { + inp_attn = build_attn_inp_kv(); + } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12281,17 +12297,36 @@ struct llm_build_olmo2 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( + const bool is_swa = hparams.is_swa(il); + + if (is_swa) { + // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling. + // This is achieved here by setting freq_scale and attn_factor to 1. + // We also set ext_factor to 0 to avoid a few unnecessary computations. + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_ext( + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -19131,7 +19166,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_OLMO2: { - llm = std::make_unique(*this, params); + if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) { + llm = std::make_unique>(*this, params); + } else { + llm = std::make_unique>(*this, params); + } } break; case LLM_ARCH_OLMOE: { From 1cbd80f8cf80a817715b1ccc5680fe2a3c5172c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 17 Sep 2025 15:29:00 +0800 Subject: [PATCH 14/19] examples : support encoder-decoder models in the simple example (#16002) Signed-off-by: Jie Fu --- examples/simple/simple.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e584..d09771d104 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -145,6 +145,20 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + if (llama_model_has_encoder(model)) { + if (llama_encode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); + } + + batch = llama_batch_get_one(&decoder_start_token_id, 1); + } + // main loop const auto t_main_start = ggml_time_us(); From 745cbcf2fe1eb88f8db615ac622f0b944d924ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 17 Sep 2025 15:30:55 +0800 Subject: [PATCH 15/19] llama-quant : fix the verification of attention layers for encoder-decoder models (#16023) Signed-off-by: Jie Fu --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c93e8065a8..97228b2a69 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // attention layers have a non-zero number of kv heads int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; + // now n_attn_layer is the number of attention layers in the encoder + // for each decoder block, there are 2 attention layers + n_attn_layer += 2 * model.hparams.dec_n_layer; } GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected"); } From a91d035b901e8a9edf810f63d130ee49adf27be2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 17 Sep 2025 09:34:09 +0200 Subject: [PATCH 16/19] ci : revert back to macos-13 for macOS-latest-cmake-x64 (#16040) This commit reverts the change of the runs-on parameter for the macOS-latest-cmake-x64 job back to macos-13 that was make in Commit 51abc96bdc52ba8cd6ad78dcf12ed9a041d7b442 ("ci : update macos-latest* jobs to use macos-latest (#15938)"). The motivation for this is that using macos-latest will cause an ARM based runner to be used, and not an x64 based runner. Refs: https://github.com/ggml-org/llama.cpp/pull/15938#issuecomment-3300805127 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0a00c85b73..ff42b19f1d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,7 +97,7 @@ jobs: ctest -L 'main|curl' --verbose --timeout 900 macOS-latest-cmake-x64: - runs-on: macos-latest + runs-on: macos-13 steps: - name: Clone From cb5bb6cc05119c24e7711ca2956cd0e6d409d396 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 17 Sep 2025 07:35:37 +0000 Subject: [PATCH 17/19] vulkan: automatically remove unsupported devices (#15976) * remove unsupported vulkan devices * make this happen during selection instead * pass by reference --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 60a99dc78b..1f1136382e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4423,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { static bool ggml_vk_instance_validation_ext_available(); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); - static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); +static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev); static void ggml_vk_instance_init() { if (vk_instance_initialized) { @@ -4540,7 +4540,7 @@ static void ggml_vk_instance_init() { new_driver.pNext = &new_id; devices[i].getProperties2(&new_props); - if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) { + if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) { // Check if there are two physical devices corresponding to the same GPU auto old_device = std::find_if( vk_instance.device_indices.begin(), @@ -12738,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available( UNUSED(instance_extensions); } +static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + vkGetPhysicalDeviceFeatures2(vkdev, &device_features2); + + return vk11_features.storageBuffer16BitAccess; +} + static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: From cd08fc3ecc0264b4414b68af3874a6c689ed60c1 Mon Sep 17 00:00:00 2001 From: David Ribeiro Alves Date: Wed, 17 Sep 2025 01:08:02 -0700 Subject: [PATCH 18/19] common : Fix corrupted memory error on json grammar initialization (#16038) Initalizing RESERVED_NAME in is_reserved_name() is not thread safe and leads to corrupted memory when used from multiple threads as can be seen in the asan trace below. This fixes the initialization to make it thread-safe. #0 0x000100abd018 in std::__1::pair, std::__1::allocator>, void*>*>, bool> std::__1::__hash_table, std::__1::allocator>, std::__1::hash, std::__1::allocator>>, std::__1::equal_to, std::__1::allocator>>, std::__1::allocator, std::__1::allocator>>>::__emplace_unique_key_args, std::__1::allocator>, std::__1::basic_string, std::__1::allocator> const&>(std::__1::basic_string, std::__1::allocator> const&, std::__1::basic_string, std::__1::allocator> const&) __hash_table:1565 #1 0x000100ab0320 in SchemaConverter::visit(nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&, std::__1::basic_string, std::__1::allocator> const&) json-schema-to-grammar.cpp:802 #2 0x000100aafc48 in std::__1::__function::__func const&, common_grammar_options const&)::$_2, std::__1::allocator const&, common_grammar_options const&)::$_2>, std::__1::basic_string, std::__1::allocator> (std::__1::basic_string, std::__1::allocator> const&, nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&)>::operator()(std::__1::basic_string, std::__1::allocator> const&, nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&) function.h:319 #3 0x000100a2c938 in std::__1::__function::__func, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&), std::__1::allocator, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&)>, void (nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&)>::operator()(nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&) function.h:319 #4 0x000100a139f8 in foreach_function(nlohmann::json_abi_v3_12_0::basic_json, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&, std::__1::function, std::__1::allocator>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector>, void> const&)> const&) chat.cpp:762 #5 0x000100a2a7f4 in std::__1::__function::__func, void (common_grammar_builder const&)>::operator()(common_grammar_builder const&) function.h:319 #6 0x000100aa98f4 in build_grammar(std::__1::function const&, common_grammar_options const&) json-schema-to-grammar.cpp:982 #7 0x0001009c9314 in common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool) chat.cpp:1110 #8 0x0001009b8afc in common_chat_templates_apply_jinja(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:1992 #9 0x0001009b533c in common_chat_templates_apply(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:2074 #10 0x000100810120 in llamacpp_apply_chat_template+0x724 (predict_oai-98384e17fb94e863:arm64+0x100090120) ... ==45482==Register values: x[0] = 0x00006020004147f8 x[1] = 0x00006080000013c8 x[2] = 0x0000000000000000 x[3] = 0x0000604006289738 x[4] = 0x0000000000000002 x[5] = 0x0000000000000001 x[6] = 0x04034000004b4000 x[7] = 0x0000000000000001 x[8] = 0xbebebebebebebebe x[9] = 0x17d7d7d7d7d7d7d7 x[10] = 0x00000c04000828ff x[11] = 0x0000000000000001 x[12] = 0x000000002018d383 x[13] = 0x0000000000000000 x[14] = 0xfa0000000000fafa x[15] = 0x000010700001ffff x[16] = 0x000000019dc012c0 x[17] = 0x00000001021284f8 x[18] = 0x0000000000000000 x[19] = 0x00000001700acdc0 x[20] = 0x0000000000000002 x[21] = 0x000000002018d384 x[22] = 0x16dd16fd2e731151 x[23] = 0x0000007000020000 x[24] = 0x0000000100c69c08 x[25] = 0x0000000100c69c20 x[26] = 0x00006080000013c7 x[27] = 0x0000000100c69c00 x[28] = 0x00000001700acd60 fp = 0x00000001700aceb0 lr = 0x0000000100abce30 sp = 0x00000001700acd60 AddressSanitizer can not provide additional info. SUMMARY: AddressSanitizer: SEGV __hash_table:1565 in std::__1::pair, std::__1::allocator>, void*>*>, bool> std::__1::__hash_table, std::__1::allocator>, std::__1::hash, std::__1::allocator>>, std::__1::equal_to, std::__1::allocator>>, std::__1::allocator, std::__1::allocator>>>::__emplace_unique_key_args, std::__1::allocator>, std::__1::basic_string, std::__1::allocator> const&>(std::__1::basic_string, std::__1::allocator> const&, std::__1::basic_string, std::__1::allocator> const&) Thread T5 created by T0 here: #0 0x0001020b99d4 in pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:arm64e+0x359d4) #1 0x000100873910 in std::sys::pal::unix::thread::Thread::new::h77254fdd87a28e05+0x118 (predict_oai-98384e17fb94e863:arm64+0x1000f3910) #2 0x0001007c7a1c in test::run_test::haeb3c2bcd5ed6cf6+0x76c (predict_oai-98384e17fb94e863:arm64+0x100047a1c) #3 0x0001007aedb0 in test::console::run_tests_console::he9d142d704f3a986+0x149c (predict_oai-98384e17fb94e863:arm64+0x10002edb0) #4 0x0001007c5758 in test::test_main::hf86a5e20735245b9+0x118 (predict_oai-98384e17fb94e863:arm64+0x100045758) #5 0x0001007c5da0 in test::test_main_static::h61ee9c8fd30abca0+0x54 (predict_oai-98384e17fb94e863:arm64+0x100045da0) ... ==45482==ABORTING --- common/json-schema-to-grammar.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 182c787544..db1f0b23dd 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -257,12 +257,13 @@ std::unordered_map STRING_FORMAT_RULES = { }; static bool is_reserved_name(const std::string & name) { - static std::unordered_set RESERVED_NAMES; - if (RESERVED_NAMES.empty()) { - RESERVED_NAMES.insert("root"); - for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); - for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); - } + static const std::unordered_set RESERVED_NAMES = [] { + std::unordered_set s; + s.insert("root"); + for (const auto & p : PRIMITIVE_RULES) s.insert(p.first); + for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first); + return s; + }(); return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); } From c959b676be29e93f8dbc3bd6056ceba812a9eb72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 17 Sep 2025 15:32:42 +0200 Subject: [PATCH 19/19] CUDA: fix FA occupancy, optimize tile kernel (#15982) --- ggml/src/ggml-cuda/common.cuh | 16 + ggml/src/ggml-cuda/fattn-common.cuh | 13 +- ggml/src/ggml-cuda/fattn-tile.cu | 551 ++++++++++++++++------------ ggml/src/ggml-cuda/vendors/hip.h | 34 +- 4 files changed, 361 insertions(+), 253 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index b0feea3623..045c6d3006 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -75,6 +75,8 @@ #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) +#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads @@ -325,6 +327,20 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() { #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) } +// Maximum number of bytes that can be copied in a single instruction. +static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() { +#ifdef GGML_USE_HIP + return 16; +#else +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA + return 16; +#else + return 8; +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // GGML_USE_HIP +} + + [[noreturn]] static __device__ void no_device_code( const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index b69f57d659..142a3a88d1 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -647,9 +647,7 @@ static __global__ void flash_attn_stream_k_fixup( } template // D == head size -#if !defined(GGML_USE_HIP) __launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIP) static __global__ void flash_attn_combine_results( const float * __restrict__ VKQ_parts, const float2 * __restrict__ VKQ_meta, @@ -692,10 +690,7 @@ static __global__ void flash_attn_combine_results( float VKQ_numerator = 0.0f; float VKQ_denominator = 0.0f; for (int l = 0; l < parallel_blocks; ++l) { - const float diff = meta[l].x - kqmax; - float KQ_max_scale = expf(diff); - const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD); - *((uint32_t *) &KQ_max_scale) &= ftz_mask; + const float KQ_max_scale = expf(meta[l].x - kqmax); VKQ_numerator += KQ_max_scale * VKQ_parts[l*D + tid]; VKQ_denominator += KQ_max_scale * meta[l].y; @@ -836,11 +831,10 @@ void launch_fattn( CUDA_CHECK(cudaGetLastError()); } - int parallel_blocks = 1; - const dim3 block_dim(warp_size, nwarps, 1); int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); + int parallel_blocks = max_blocks_per_sm; dim3 blocks_num; if (stream_k) { @@ -862,9 +856,6 @@ void launch_fattn( GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0); const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size. - // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave: - parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1); - // parallel_blocks must not be larger than what the tensor size allows: parallel_blocks = std::min(parallel_blocks, ntiles_KQ); diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu index c6a399ce5d..a2d9951ea5 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cu +++ b/ggml/src/ggml-cuda/fattn-tile.cu @@ -2,20 +2,30 @@ #include "fattn-common.cuh" #include "fattn-tile.cuh" -#define FATTN_TILE_NTHREADS 256 +// kq_stride == number of KQ rows to process per iteration +// kq_nbatch == number of K columns to load in parallel for KQ calculation static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) { if (GGML_CUDA_CC_IS_AMD(cc)) { + if (GGML_CUDA_CC_IS_RDNA(cc)) { + switch (D) { + case 64: + return 128; + case 128: + case 256: + return ncols <= 16 ? 128 : 64; + default: + GGML_ABORT("fatal error"); + return -1; + } + } switch (D) { case 64: - return 64; + return ncols == 32 ? 128 : 64; case 128: + return ncols == 32 ? 64 : 32; case 256: - if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { - return ncols <= 16 ? 64 : 32; - } else { - return 64; - } + return 32; default: GGML_ABORT("fatal error"); return -1; @@ -49,24 +59,28 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) { #ifdef GGML_USE_HIP +#ifdef RDNA switch (D) { case 64: - return 64; + return 128; case 128: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 32; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) case 256: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 32; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) + return ncols <= 16 ? 128 : 64; default: return -1; } +#else + switch (D) { + case 64: + return ncols == 32 ? 128 : 64; + case 128: + return ncols == 32 ? 64 : 32; + case 256: + return 32; + default: + return -1; + } +#endif // RDNA #else #ifdef FAST_FP16_AVAILABLE switch (D) { @@ -100,17 +114,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols case 64: return 64; case 128: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 128; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) case 256: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 128; -#else - return ncols <= 16 ? 64 : 256; -#endif // defined(GCN) || defined(CDNA) + return 128; default: return -1; } @@ -120,9 +125,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols case 64: return 64; case 128: - return ncols <= 16 ? 128 : 64; case 256: - return ncols <= 16 ? 64 : 128; + return 128; default: return -1; } @@ -142,12 +146,27 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols GGML_UNUSED_VARS(ncols, warp_size); } -template // D == head size -#ifdef GGML_USE_HIP -__launch_bounds__(FATTN_TILE_NTHREADS, 1) +static int fattn_tile_get_nthreads_host(const int cc, const int ncols) { + return 256; + GGML_UNUSED_VARS(cc, ncols); +} + +static constexpr __device__ int fattn_tile_get_nthreads_device(int ncols) { + return 256; + GGML_UNUSED(ncols); +} + +static constexpr __device__ int fattn_tile_get_occupancy_device(int ncols) { +#ifdef RDNA + return 3; #else -__launch_bounds__(FATTN_TILE_NTHREADS, 2) -#endif // GGML_USE_HIP + return ncols <= 16 ? 3 : 2; +#endif // RDNA + GGML_UNUSED(ncols); +} + +template // D == head size +__launch_bounds__(fattn_tile_get_nthreads_device(ncols), fattn_tile_get_occupancy_device(ncols)) static __global__ void flash_attn_tile( const char * __restrict__ Q, const char * __restrict__ K, @@ -193,7 +212,7 @@ static __global__ void flash_attn_tile( } constexpr int warp_size = 32; - constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size; + constexpr int nwarps = fattn_tile_get_nthreads_device(ncols) / warp_size; constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size); static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size."); constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size); @@ -206,90 +225,126 @@ static __global__ void flash_attn_tile( const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); - const float * sinksf = (const float *) (sinks); + const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); + const float * sinksf = (const float *) (sinks); const int stride_KV2 = nb11 / sizeof(half2); const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); -#if defined(GGML_USE_HIP) - constexpr int cpy_nb = 16; -#else - constexpr int cpy_nb = 8; -#endif // defined(GGML_USE_HIP) && defined(GCN) + constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes(); constexpr int cpy_ne = cpy_nb / 4; - __shared__ float KQ[ncols][kq_stride]; + constexpr int cpw = ncols/nwarps; // cols per warp + + // softmax_iter_j == number of KQ columns for which to calculate softmax in parallel. + // KQ is originall 2D but uses a Z-shaped memory pattern for larger reads/writes. #ifdef FAST_FP16_AVAILABLE + constexpr int softmax_iter_j = cpw < 2*cpy_ne ? cpw : 2*cpy_ne; + + __shared__ half KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j]; __shared__ half2 Q_tmp[ncols][D/2]; - __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts. - half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; + __shared__ half2 KV_tmp[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts. + half2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; #else + constexpr int softmax_iter_j = cpw < 1*cpy_ne ? cpw : 1*cpy_ne; + + __shared__ float KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j]; __shared__ float Q_tmp[ncols][D]; - __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts. - float2 * KV_tmp_f2 = (float2 *) KV_tmp_f; - float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; + __shared__ float KV_tmp[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts. + float2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; #endif // FAST_FP16_AVAILABLE + static_assert(cpw % softmax_iter_j == 0, "bad softmax_iter_j"); - - float kqmax[ncols/nwarps]; + float KQ_max[cpw]; #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { - kqmax[j0/nwarps] = -FLT_MAX/2.0f; + KQ_max[j0/nwarps] = -FLT_MAX/2.0f; } - float kqsum[ncols/nwarps] = {0.0f}; + float KQ_sum[cpw] = {0.0f}; + + // Load Q data, convert to FP16 if fast. +#pragma unroll + for (int j0 = 0; j0 < cpw; ++j0) { + const int j = j0 + threadIdx.y*cpw; + + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + float tmp_f[cpy_ne_D] = {0.0f}; + if (ic0 + j < ne01) { + ggml_cuda_memcpy_1(tmp_f, &Q_f[j*(nb01/sizeof(float)) + i0 + threadIdx.x*cpy_ne_D]); + } #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f); + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp_f[i1] *= scale; + } + #ifdef FAST_FP16_AVAILABLE - Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale); + half2 tmp_h2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) { + tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]); + } + ggml_cuda_memcpy_1(&Q_tmp[j][i0/2 + threadIdx.x*(cpy_ne_D/2)], tmp_h2); #else - Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale; - Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale; + ggml_cuda_memcpy_1 (&Q_tmp[j][i0 + threadIdx.x* cpy_ne_D], tmp_f); #endif // FAST_FP16_AVAILABLE } } __syncthreads(); + // Main loop over KV cache: const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11; for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) { // Calculate KQ tile and keep track of new maximum KQ values: - float kqmax_new[ncols/nwarps]; + float KQ_max_new[cpw]; #pragma unroll - for (int j = 0; j < ncols/nwarps; ++j) { - kqmax_new[j] = kqmax[j]; + for (int j = 0; j < cpw; ++j) { + KQ_max_new[j] = KQ_max[j]; } - float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}}; + float KQ_acc[kq_stride/warp_size][cpw] = {{0.0f}}; // Accumulators for KQ matrix multiplication. + // KQ = K @ Q matrix multiplication: #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) { #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) { const int i_KQ = i_KQ_0 + threadIdx.y; -#pragma unroll - for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) { - const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2; -#else - const float2 tmp_f2 = __half22float2(tmp_h2); - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x; - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y; -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/(2*warp_size) ? cpy_ne : kq_nbatch/(2*warp_size); +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size*cpy_ne_kqnb) { + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], + &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x*cpy_ne_kqnb]); } +#else + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/warp_size ? cpy_ne : kq_nbatch/warp_size; +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += warp_size*cpy_ne_kqnb) { + half2 tmp_h2[cpy_ne_kqnb/2]; + ggml_cuda_memcpy_1( + tmp_h2, &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1/2 + threadIdx.x*(cpy_ne_kqnb/2)]); + + float2 tmp_f2[cpy_ne_kqnb/2]; +#pragma unroll + for (int k_KQ_2 = 0; k_KQ_2 < cpy_ne_kqnb/2; ++k_KQ_2) { + tmp_f2[k_KQ_2] = __half22float2(tmp_h2[k_KQ_2]); + } + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); @@ -298,12 +353,12 @@ static __global__ void flash_attn_tile( #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) { half2 K_k[kq_stride/warp_size][cpy_ne]; - half2 Q_k[ncols/nwarps][cpy_ne]; + half2 Q_k[cpw][cpy_ne]; #else #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) { float K_k[kq_stride/warp_size][cpy_ne]; - float Q_k[ncols/nwarps][cpy_ne]; + float Q_k[cpw][cpy_ne]; #endif // FAST_FP16_AVAILABLE #pragma unroll @@ -311,29 +366,29 @@ static __global__ void flash_attn_tile( const int i_KQ = i_KQ_0 + threadIdx.x; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); #else - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); #else - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { #pragma unroll for (int k = 0; k < cpy_ne; ++k) { - ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]); + ggml_cuda_mad(KQ_acc[i_KQ_0/warp_size][j_KQ_0], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0][k]); } } } @@ -344,104 +399,77 @@ static __global__ void flash_attn_tile( } } + // Apply logit softcap, mask, update KQ_max: #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { const int i_KQ = i_KQ_0 + threadIdx.x; #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; if (use_logit_softcap) { - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); + KQ_acc[i_KQ_0/warp_size][j_KQ_0] = logit_softcap * tanhf(KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; + KQ_acc[i_KQ_0/warp_size][j_KQ_0] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; - kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); - - KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps]; + KQ_max_new[j_KQ_0] = fmaxf(KQ_max_new[j_KQ_0], KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } } __syncthreads(); + // Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators: #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]); - kqmax[j0/nwarps] = kqmax_new[j0/nwarps]; - - float kqsum_add = 0.0f; - if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) { - const int i = i0 + 4*threadIdx.x; - - float4 val = *(const float4 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - val.z = expf(val.z - kqmax[j0/nwarps]); - val.w = expf(val.w - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y + val.z + val.w; - + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { #ifdef FAST_FP16_AVAILABLE - const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)}; - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); + half tmp[kq_stride/warp_size][softmax_iter_j]; #else - ggml_cuda_memcpy_1(&KQ[j][i], &val); + float tmp[kq_stride/warp_size][softmax_iter_j]; #endif // FAST_FP16_AVAILABLE - } - } else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) { - const int i = i0 + 2*threadIdx.x; - float2 val = *(const float2 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y; -#ifdef FAST_FP16_AVAILABLE - const half2 tmp = make_half2(val.x, val.y); - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); -#else - ggml_cuda_memcpy_1(&KQ[j][i], &val); -#endif // FAST_FP16_AVAILABLE - } - } else { +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_max_new[j0+j1] = warp_reduce_max(KQ_max_new[j0+j1]); + const float KQ_max_scale = expf(KQ_max[j0+j1] - KQ_max_new[j0+j1]); + KQ_max[j0+j1] = KQ_max_new[j0+j1]; + + float KQ_sum_add = 0.0f; +#pragma unroll for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const float diff = KQ[j][i] - kqmax[j0/nwarps]; - const float val = expf(diff); - kqsum_add += val; -#ifdef FAST_FP16_AVAILABLE - ((half *) KQ[j])[i] = val; -#else - KQ[j][i] = val; -#endif // FAST_FP16_AVAILABLE + const float val = expf(KQ_acc[i0/warp_size][j0+j1] - KQ_max[j0+j1]); + KQ_sum_add += val; + tmp[i0/warp_size][j1] = val; } - } - kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add; + KQ_sum[j0+j1] = KQ_sum[j0+j1]*KQ_max_scale + KQ_sum_add; #ifdef FAST_FP16_AVAILABLE - const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size] *= KQ_max_scale_h2; + } #else #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size].x *= KQ_max_scale; + VKQ[j0+j1][i0/warp_size].y *= KQ_max_scale; + } #endif // FAST_FP16_AVAILABLE + } + +#pragma unroll + for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { + const int i = i0 + threadIdx.x; + + ggml_cuda_memcpy_1( + KQ[j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j)][i], tmp[i0/warp_size]); + } } - constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; + // VKQ = V @ KQ matrix multiplication: + constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; // Number of V columns that fit in SRAM for K. static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter"); #pragma unroll for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) { @@ -449,65 +477,96 @@ static __global__ void flash_attn_tile( for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) { const int k_tile = k1 + threadIdx.y; -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[k_tile*(D/2) + i] = tmp; -#else - KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp); -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_D = cpy_ne < D/(2*warp_size) ? cpy_ne : D/(2*warp_size); +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*(D/2) + i0 + threadIdx.x*cpy_ne_D], + &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0 + threadIdx.x*cpy_ne_D]); } +#else + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + half2 tmp_h2[cpy_ne_D/2]; + ggml_cuda_memcpy_1( + tmp_h2, &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0/2 + threadIdx.x*(cpy_ne_D/2)]); + + float2 tmp_f2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D/2; ++i1) { + tmp_f2[i1] = __half22float2(tmp_h2[i1]); + } + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*D + i0 + threadIdx.x*cpy_ne_D], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); +#ifdef FAST_FP16_AVAILABLE #pragma unroll for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { -#ifdef FAST_FP16_AVAILABLE half2 V_k[(D/2)/warp_size]; - half2 KQ_k[ncols/nwarps]; -#else - float2 V_k[(D/2)/warp_size]; - float KQ_k[ncols/nwarps]; -#endif // FAST_FP16_AVAILABLE + half2 KQ_k[cpw]; + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size; #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - -#ifdef FAST_FP16_AVAILABLE - V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i]; -#else - V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i]; -#endif // FAST_FP16_AVAILABLE + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/warp_size], &KV_tmp[k1*(D/2) + i0 + threadIdx.x*cpy_ne_D]); } #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); -#ifdef FAST_FP16_AVAILABLE - KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]); -#else - KQ_k[j0/nwarps] = KQ[j][k0 + k1]; -#endif // FAST_FP16_AVAILABLE + half tmp[softmax_iter_j]; + ggml_cuda_memcpy_1( + &tmp, KQ[j][k0 + k1]); +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_k[j0+j1] = __half2half2(tmp[j1]); + } } #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { -#ifdef FAST_FP16_AVAILABLE - VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps]; -#else - VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps]; - VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps]; -#endif // FAST_FP16_AVAILABLE + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size] += V_k[i0/warp_size]*KQ_k[j0]; } } } +#else +#pragma unroll + for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { + float2 V_k[(D/2)/warp_size]; + float KQ_k[cpw]; + + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/(2*warp_size)], &KV_tmp[k1*D + i0 + threadIdx.x*cpy_ne_D]); + } +#pragma unroll + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); + + ggml_cuda_memcpy_1( + &KQ_k[j0], KQ[j][k0 + k1]); + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size) { +#pragma unroll + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0]; + VKQ[j0][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0]; + } + } + } +#endif // FAST_FP16_AVAILABLE __syncthreads(); } @@ -519,69 +578,92 @@ static __global__ void flash_attn_tile( const float sink = sinksf[head]; #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink); - kqmax_new_j = warp_reduce_max(kqmax_new_j); + for (int j0 = 0; j0 < cpw; ++j0) { + float KQ_max_new_j = fmaxf(KQ_max[j0], sink); + KQ_max_new_j = warp_reduce_max(KQ_max_new_j); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j); - kqmax[j0/nwarps] = kqmax_new_j; + const float KQ_max_scale = expf(KQ_max[j0] - KQ_max_new_j); + KQ_max[j0] = KQ_max_new_j; - const float val = expf(sink - kqmax[j0/nwarps]); - kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale; + const float val = expf(sink - KQ_max[j0]); + KQ_sum[j0] = KQ_sum[j0] * KQ_max_scale; if (threadIdx.x == 0) { - kqsum[j0/nwarps] += val; + KQ_sum[j0] += val; } #ifdef FAST_FP16_AVAILABLE const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; + VKQ[j0][i0/warp_size] *= KQ_max_scale_h2; } #else #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; + VKQ[j0][i0/warp_size].x *= KQ_max_scale; + VKQ[j0][i0/warp_size].y *= KQ_max_scale; } #endif // FAST_FP16_AVAILABLE } } - float2 * dst2 = (float2 *) dst; - #pragma unroll - for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { - const int j_VKQ = j_VKQ_0 + threadIdx.y; + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + KQ_sum[j_VKQ_0] = warp_reduce_sum(KQ_sum[j_VKQ_0]); + } + if (gridDim.y == 1) { +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { +#ifdef FAST_FP16_AVAILABLE + const half2 KQ_sum_j_inv = make_half2(1.0f/KQ_sum[j_VKQ_0], 1.0f/KQ_sum[j_VKQ_0]); +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i] *= KQ_sum_j_inv; + } +#else + const float KQ_sum_j_inv = 1.0f/KQ_sum[j_VKQ_0]; +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i].x *= KQ_sum_j_inv; + VKQ[j_VKQ_0][i].y *= KQ_sum_j_inv; + } +#endif // FAST_FP16_AVAILABLE + } + } + + // Write back results: +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + const int j_VKQ = j_VKQ_0 + threadIdx.y*cpw; if (ic0 + j_VKQ >= ne01) { return; } - float kqsum_j = kqsum[j_VKQ_0/nwarps]; - kqsum_j = warp_reduce_sum(kqsum_j); - const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; -#pragma unroll - for (int i00 = 0; i00 < D/2; i00 += warp_size) { - const int i0 = i00 + threadIdx.x; - #ifdef FAST_FP16_AVAILABLE - float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]); + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + float2 tmp[cpy_ne_D]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp[i1] = __half22float2(VKQ[j_VKQ_0][i0/warp_size + i1]); + } + ggml_cuda_memcpy_1(&dst[j_dst_unrolled*D + 2*i0 + threadIdx.x*(2*cpy_ne_D)], tmp); + } #else - float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size]; + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &dst[j_dst_unrolled*D + i0 + threadIdx.x*cpy_ne_D], &VKQ[j_VKQ_0][i0/(2*warp_size)]); + } #endif // FAST_FP16_AVAILABLE - if (gridDim.y == 1) { - dst_val.x /= kqsum_j; - dst_val.y /= kqsum_j; - } - dst2[j_dst_unrolled*(D/2) + i0] = dst_val; - } - if (gridDim.y != 1 && threadIdx.x == 0) { - dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + dst_meta[j_dst_unrolled] = make_float2(KQ_max[j_VKQ_0], KQ_sum[j_VKQ_0]); } } #else @@ -602,15 +684,29 @@ template static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int warp_size = 32; - const int nwarps = FATTN_TILE_NTHREADS / warp_size; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const int warp_size = 32; constexpr size_t nbytes_shared = 0; +#ifdef GGML_USE_HIP + if constexpr (D <= 128) { + if (Q->ne[1] > 32) { + constexpr int cols_per_block = 64; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; + fattn_kernel_t fattn_kernel = flash_attn_tile; + const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); + launch_fattn + (ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size); + return; + } + } +#endif // GGML_USE_HIP + if (Q->ne[1] > 16) { constexpr int cols_per_block = 32; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn @@ -619,6 +715,7 @@ static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml } constexpr int cols_per_block = 16; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 12bbee4556..37386afcd4 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -158,41 +158,41 @@ #define __CUDA_ARCH__ 1300 -#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) -#define GCN -#endif - #if defined(__gfx900__) || defined(__gfx906__) #define GCN5 -#endif +#endif // defined(__gfx900__) || defined(__gfx906__) #if defined(__gfx803__) #define GCN4 -#endif +#endif // defined(__gfx803__) -#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) -#define CDNA // For the entire family -#endif +#if defined(GCN5) || defined(GCN4) +#define GCN +#endif // defined(GCN5) || defined(GCN4) #if defined(__gfx942__) #define CDNA3 -#endif +#endif // defined(__gfx942__) #if defined(__gfx90a__) #define CDNA2 -#endif +#endif // defined(__gfx90a__) #if defined(__gfx908__) #define CDNA1 -#endif +#endif // defined(__gfx908__) + +#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1) +#define CDNA // For the entire family +#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1) #if defined(__GFX12__) #define RDNA4 -#endif +#endif // defined(__GFX12__) #if defined(__GFX11__) #define RDNA3 -#endif +#endif // defined(__GFX11__) #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) @@ -201,7 +201,11 @@ #if defined(__gfx1010__) || defined(__gfx1012__) #define RDNA1 -#endif +#endif // defined(__gfx1010__) || defined(__gfx1012__) + +#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) +#define RDNA // For the entire family +#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) #ifndef __has_builtin #define __has_builtin(x) 0