diff --git a/.clang-format b/.clang-format index 117e6986f6..742723fc8f 100644 --- a/.clang-format +++ b/.clang-format @@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true +# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them +AttributeMacros: + - __host__ + - __device__ + - __global__ + - __forceinline__ + - __launch_bounds__ BinPackArguments: true BinPackParameters: false # OnePerLine BitFieldColonSpacing: Both diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 221077197d..106c62b4dc 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -17,14 +17,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build # gfx906 is deprecated #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html -ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201' -#ARG ROCM_DOCKER_ARCH=gfx1100 +ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151' +#ARG ROCM_DOCKER_ARCH='gfx1151' -# Set ROCm architectured +# Set ROCm architectures ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -# ENV CC=/opt/rocm/llvm/bin/clang -# ENV CXX=/opt/rocm/llvm/bin/clang++ RUN apt-get update \ && apt-get install -y \ @@ -39,8 +36,16 @@ WORKDIR /app COPY . . +RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1 + RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ + cmake -S . -B build \ + -DGGML_HIP=ON \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ + -DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \ + -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \ + -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \ + -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ && cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1bd35627da..ff42b19f1d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -56,7 +56,7 @@ env: jobs: macOS-latest-cmake-arm64: - runs-on: macos-14 + runs-on: macos-latest steps: - name: Clone @@ -138,7 +138,7 @@ jobs: ctest -L main --verbose --timeout 900 macOS-latest-cmake-arm64-webgpu: - runs-on: macos-14 + runs-on: macos-latest steps: - name: Clone @@ -711,6 +711,7 @@ jobs: macOS-latest-swift: runs-on: macos-latest + needs: ios-xcode-build strategy: matrix: @@ -727,6 +728,12 @@ jobs: key: macOS-latest-swift evict-old-files: 1d + - name: Download xcframework artifact + uses: actions/download-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + - name: Dependencies id: depends continue-on-error: true @@ -748,11 +755,6 @@ jobs: -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - windows-msys2: runs-on: windows-2025 @@ -1170,8 +1172,17 @@ jobs: run: | ./build-xcframework.sh + - name: Upload xcframework artifact + uses: actions/upload-artifact@v4 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + retention-days: 1 + - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + run: | + xcodebuild -downloadPlatform iOS + xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build android-build: runs-on: ubuntu-latest diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e959209ab2..f461456edf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -530,15 +530,13 @@ jobs: runs-on: windows-2022 env: - # The ROCm version must correspond to the version used in the HIP SDK. - ROCM_VERSION: "6.4.2" HIPSDK_INSTALLER_VERSION: "25.Q3" strategy: matrix: include: - name: "radeon" - gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -548,7 +546,7 @@ jobs: - name: Clone rocWMMA repository id: clone_rocwmma run: | - git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1 + git clone https://github.com/rocm/rocwmma --branch develop --depth 1 - name: Cache ROCm Installation id: cache-rocm diff --git a/CMakeLists.txt b/CMakeLists.txt index 36a2078e4c..4720e1f1a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,12 @@ if (MSVC) add_compile_options("$<$:/bigobj>") endif() +if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + set(LLAMA_TOOLS_INSTALL_DEFAULT OFF) +else() + set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE}) +endif() + # # option list # @@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) # 3rd party libs option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) diff --git a/common/arg.cpp b/common/arg.cpp index c15008fe79..9fd8858e82 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1704,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.system_prompt = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), @@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", [](common_params & params) { - params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_env("LLAMA_ARG_CPU_MOE")); add_opt(common_arg( @@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex for (int i = 0; i < value; ++i) { // keep strings alive and avoid leaking memory by storing them in a static vector static std::list buft_overrides; - buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides.push_back(llm_ffn_exps_block_regex(i)); params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()}); } } @@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe-draft", "-cmoed"}, "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", [](common_params & params) { - params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( @@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (int i = 0; i < value; ++i) { static std::list buft_overrides_draft; - buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); } } diff --git a/common/common.h b/common/common.h index 5063d73f96..83d44dbaa7 100644 --- a/common/common.h +++ b/common/common.h @@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } +// +// MoE utils +// + +const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps"; + +static std::string llm_ffn_exps_block_regex(int idx) { + return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX); +} + +static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { + return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() }; +} + // // training utils // diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 182c787544..db1f0b23dd 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -257,12 +257,13 @@ std::unordered_map STRING_FORMAT_RULES = { }; static bool is_reserved_name(const std::string & name) { - static std::unordered_set RESERVED_NAMES; - if (RESERVED_NAMES.empty()) { - RESERVED_NAMES.insert("root"); - for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); - for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); - } + static const std::unordered_set RESERVED_NAMES = [] { + std::unordered_set s; + s.insert("root"); + for (const auto & p : PRIMITIVE_RULES) s.insert(p.first); + for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first); + return s; + }(); return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3468fc7757..8142fa9328 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -891,6 +891,9 @@ class TextModel(ModelBase): if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": # ref: https://huggingface.co/ibm-granite/granite-embedding-small-english-r2 res = "modern-bert" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base + res = "llada-moe" if res is None: logger.warning("\n") @@ -6009,9 +6012,34 @@ class SeedOssModel(TextModel): @ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th. + for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + @ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): @@ -8242,6 +8270,76 @@ class HunYuanMoEModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 00486efb7b..8f2467194d 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -140,6 +140,7 @@ models = [ {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, {"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-small-english-r2", }, + {"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index abf7fb3573..273942a165 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx, n_generated = params.max_length; } -static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { +static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) { if (!use_chat_template) { return prompt; } auto chat_templates = common_chat_templates_init(model, ""); - common_chat_templates_inputs inputs; - common_chat_msg user_msg; - user_msg.role = "user"; - user_msg.content = prompt; - inputs.add_generation_prompt = true; + common_chat_msg system_msg; + + if (!system_prompt.empty()) { + system_msg.role = "system"; + system_msg.content = system_prompt; + inputs.messages.push_back(system_msg); + } + + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = prompt; + inputs.messages.push_back(user_msg); + inputs.add_generation_prompt = true; auto result = common_chat_templates_apply(chat_templates.get(), inputs); @@ -579,7 +587,8 @@ int main(int argc, char ** argv) { llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); const llama_vocab * vocab = llama_model_get_vocab(model); - std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); + + std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model); std::vector input_tokens = common_tokenize(vocab, formatted_prompt, @@ -596,6 +605,7 @@ int main(int argc, char ** argv) { } llama_token mask_token_id = llama_vocab_mask(vocab); + GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); bool visual_mode = params.diffusion.visual_mode; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e584..d09771d104 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -145,6 +145,20 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + if (llama_model_has_encoder(model)) { + if (llama_encode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); + } + + batch = llama_batch_get_one(&decoder_start_token_id, 1); + } + // main loop const auto t_main_start = ggml_time_us(); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index c5fce8dc91..b707b84359 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -526,7 +526,10 @@ struct ggml_backend_cann_context { */ aclrtStream stream(int stream) { if (streams[stream] == nullptr) { - ggml_cann_set_device(device); + // If the device is not set here, destroying the stream later may cause a mismatch + // between the thread contexts where the stream was created and destroyed. + // However, I printed the device_id, thread_id, and stream, and they are all consistent. + ACL_CHECK(aclrtSetDevice(device)); ACL_CHECK(aclrtCreateStream(&streams[stream])); } return streams[stream]; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 19a18a281d..56d82b4af3 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -75,13 +75,12 @@ * @param device The device ID to set. */ void ggml_cann_set_device(const int32_t device) { - // TODO: uncomment these lines after empty context has fixed. - // int current_device; - // ACL_CHECK(aclrtGetDevice(¤t_device)); + int current_device = -1; + aclrtGetDevice(¤t_device); - // if (device == current_device) { - // return; - // } + if (device == current_device) { + return; + } ACL_CHECK(aclrtSetDevice(device)); } @@ -1729,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, ggml_cann_get_rows(ctx, dst); break; case GGML_OP_SET_ROWS: + std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl; ggml_cann_set_rows(ctx, dst); break; case GGML_OP_DUP: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 212e52ef6a..c4824d145a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32( } if (dim % 2 != 0 && ith == 0) { embed_data[2 * half] = 0.f; - embed_data[dim] = 0.f; } } } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index b0feea3623..045c6d3006 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -75,6 +75,8 @@ #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) +#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads @@ -325,6 +327,20 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() { #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) } +// Maximum number of bytes that can be copied in a single instruction. +static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() { +#ifdef GGML_USE_HIP + return 16; +#else +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA + return 16; +#else + return 8; +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // GGML_USE_HIP +} + + [[noreturn]] static __device__ void no_device_code( const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index b69f57d659..142a3a88d1 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -647,9 +647,7 @@ static __global__ void flash_attn_stream_k_fixup( } template // D == head size -#if !defined(GGML_USE_HIP) __launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIP) static __global__ void flash_attn_combine_results( const float * __restrict__ VKQ_parts, const float2 * __restrict__ VKQ_meta, @@ -692,10 +690,7 @@ static __global__ void flash_attn_combine_results( float VKQ_numerator = 0.0f; float VKQ_denominator = 0.0f; for (int l = 0; l < parallel_blocks; ++l) { - const float diff = meta[l].x - kqmax; - float KQ_max_scale = expf(diff); - const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD); - *((uint32_t *) &KQ_max_scale) &= ftz_mask; + const float KQ_max_scale = expf(meta[l].x - kqmax); VKQ_numerator += KQ_max_scale * VKQ_parts[l*D + tid]; VKQ_denominator += KQ_max_scale * meta[l].y; @@ -836,11 +831,10 @@ void launch_fattn( CUDA_CHECK(cudaGetLastError()); } - int parallel_blocks = 1; - const dim3 block_dim(warp_size, nwarps, 1); int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); + int parallel_blocks = max_blocks_per_sm; dim3 blocks_num; if (stream_k) { @@ -862,9 +856,6 @@ void launch_fattn( GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0); const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size. - // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave: - parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1); - // parallel_blocks must not be larger than what the tensor size allows: parallel_blocks = std::min(parallel_blocks, ntiles_KQ); diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu index c6a399ce5d..a2d9951ea5 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cu +++ b/ggml/src/ggml-cuda/fattn-tile.cu @@ -2,20 +2,30 @@ #include "fattn-common.cuh" #include "fattn-tile.cuh" -#define FATTN_TILE_NTHREADS 256 +// kq_stride == number of KQ rows to process per iteration +// kq_nbatch == number of K columns to load in parallel for KQ calculation static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) { if (GGML_CUDA_CC_IS_AMD(cc)) { + if (GGML_CUDA_CC_IS_RDNA(cc)) { + switch (D) { + case 64: + return 128; + case 128: + case 256: + return ncols <= 16 ? 128 : 64; + default: + GGML_ABORT("fatal error"); + return -1; + } + } switch (D) { case 64: - return 64; + return ncols == 32 ? 128 : 64; case 128: + return ncols == 32 ? 64 : 32; case 256: - if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { - return ncols <= 16 ? 64 : 32; - } else { - return 64; - } + return 32; default: GGML_ABORT("fatal error"); return -1; @@ -49,24 +59,28 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) { #ifdef GGML_USE_HIP +#ifdef RDNA switch (D) { case 64: - return 64; + return 128; case 128: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 32; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) case 256: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 32; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) + return ncols <= 16 ? 128 : 64; default: return -1; } +#else + switch (D) { + case 64: + return ncols == 32 ? 128 : 64; + case 128: + return ncols == 32 ? 64 : 32; + case 256: + return 32; + default: + return -1; + } +#endif // RDNA #else #ifdef FAST_FP16_AVAILABLE switch (D) { @@ -100,17 +114,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols case 64: return 64; case 128: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 128; -#else - return 64; -#endif // defined(GCN) || defined(CDNA) case 256: -#if defined(GCN) || defined(CDNA) - return ncols <= 16 ? 64 : 128; -#else - return ncols <= 16 ? 64 : 256; -#endif // defined(GCN) || defined(CDNA) + return 128; default: return -1; } @@ -120,9 +125,8 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols case 64: return 64; case 128: - return ncols <= 16 ? 128 : 64; case 256: - return ncols <= 16 ? 64 : 128; + return 128; default: return -1; } @@ -142,12 +146,27 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols GGML_UNUSED_VARS(ncols, warp_size); } -template // D == head size -#ifdef GGML_USE_HIP -__launch_bounds__(FATTN_TILE_NTHREADS, 1) +static int fattn_tile_get_nthreads_host(const int cc, const int ncols) { + return 256; + GGML_UNUSED_VARS(cc, ncols); +} + +static constexpr __device__ int fattn_tile_get_nthreads_device(int ncols) { + return 256; + GGML_UNUSED(ncols); +} + +static constexpr __device__ int fattn_tile_get_occupancy_device(int ncols) { +#ifdef RDNA + return 3; #else -__launch_bounds__(FATTN_TILE_NTHREADS, 2) -#endif // GGML_USE_HIP + return ncols <= 16 ? 3 : 2; +#endif // RDNA + GGML_UNUSED(ncols); +} + +template // D == head size +__launch_bounds__(fattn_tile_get_nthreads_device(ncols), fattn_tile_get_occupancy_device(ncols)) static __global__ void flash_attn_tile( const char * __restrict__ Q, const char * __restrict__ K, @@ -193,7 +212,7 @@ static __global__ void flash_attn_tile( } constexpr int warp_size = 32; - constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size; + constexpr int nwarps = fattn_tile_get_nthreads_device(ncols) / warp_size; constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size); static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size."); constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size); @@ -206,90 +225,126 @@ static __global__ void flash_attn_tile( const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0); - const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape - const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); - const float * sinksf = (const float *) (sinks); + const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio)); + const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0); + const float * sinksf = (const float *) (sinks); const int stride_KV2 = nb11 / sizeof(half2); const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1); -#if defined(GGML_USE_HIP) - constexpr int cpy_nb = 16; -#else - constexpr int cpy_nb = 8; -#endif // defined(GGML_USE_HIP) && defined(GCN) + constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes(); constexpr int cpy_ne = cpy_nb / 4; - __shared__ float KQ[ncols][kq_stride]; + constexpr int cpw = ncols/nwarps; // cols per warp + + // softmax_iter_j == number of KQ columns for which to calculate softmax in parallel. + // KQ is originall 2D but uses a Z-shaped memory pattern for larger reads/writes. #ifdef FAST_FP16_AVAILABLE + constexpr int softmax_iter_j = cpw < 2*cpy_ne ? cpw : 2*cpy_ne; + + __shared__ half KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j]; __shared__ half2 Q_tmp[ncols][D/2]; - __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts. - half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; + __shared__ half2 KV_tmp[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts. + half2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; #else + constexpr int softmax_iter_j = cpw < 1*cpy_ne ? cpw : 1*cpy_ne; + + __shared__ float KQ[ncols/softmax_iter_j][kq_stride][softmax_iter_j]; __shared__ float Q_tmp[ncols][D]; - __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts. - float2 * KV_tmp_f2 = (float2 *) KV_tmp_f; - float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; + __shared__ float KV_tmp[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts. + float2 VKQ[cpw][D/(2*warp_size)] = {{{0.0f, 0.0f}}}; #endif // FAST_FP16_AVAILABLE + static_assert(cpw % softmax_iter_j == 0, "bad softmax_iter_j"); - - float kqmax[ncols/nwarps]; + float KQ_max[cpw]; #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { - kqmax[j0/nwarps] = -FLT_MAX/2.0f; + KQ_max[j0/nwarps] = -FLT_MAX/2.0f; } - float kqsum[ncols/nwarps] = {0.0f}; + float KQ_sum[cpw] = {0.0f}; + + // Load Q data, convert to FP16 if fast. +#pragma unroll + for (int j0 = 0; j0 < cpw; ++j0) { + const int j = j0 + threadIdx.y*cpw; + + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + float tmp_f[cpy_ne_D] = {0.0f}; + if (ic0 + j < ne01) { + ggml_cuda_memcpy_1(tmp_f, &Q_f[j*(nb01/sizeof(float)) + i0 + threadIdx.x*cpy_ne_D]); + } #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f); + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp_f[i1] *= scale; + } + #ifdef FAST_FP16_AVAILABLE - Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale); + half2 tmp_h2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) { + tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]); + } + ggml_cuda_memcpy_1(&Q_tmp[j][i0/2 + threadIdx.x*(cpy_ne_D/2)], tmp_h2); #else - Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale; - Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale; + ggml_cuda_memcpy_1 (&Q_tmp[j][i0 + threadIdx.x* cpy_ne_D], tmp_f); #endif // FAST_FP16_AVAILABLE } } __syncthreads(); + // Main loop over KV cache: const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11; for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) { // Calculate KQ tile and keep track of new maximum KQ values: - float kqmax_new[ncols/nwarps]; + float KQ_max_new[cpw]; #pragma unroll - for (int j = 0; j < ncols/nwarps; ++j) { - kqmax_new[j] = kqmax[j]; + for (int j = 0; j < cpw; ++j) { + KQ_max_new[j] = KQ_max[j]; } - float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}}; + float KQ_acc[kq_stride/warp_size][cpw] = {{0.0f}}; // Accumulators for KQ matrix multiplication. + // KQ = K @ Q matrix multiplication: #pragma unroll for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) { #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) { const int i_KQ = i_KQ_0 + threadIdx.y; -#pragma unroll - for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) { - const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2; -#else - const float2 tmp_f2 = __half22float2(tmp_h2); - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x; - KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y; -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/(2*warp_size) ? cpy_ne : kq_nbatch/(2*warp_size); +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size*cpy_ne_kqnb) { + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], + &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x*cpy_ne_kqnb]); } +#else + constexpr int cpy_ne_kqnb = cpy_ne < kq_nbatch/warp_size ? cpy_ne : kq_nbatch/warp_size; +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += warp_size*cpy_ne_kqnb) { + half2 tmp_h2[cpy_ne_kqnb/2]; + ggml_cuda_memcpy_1( + tmp_h2, &K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1/2 + threadIdx.x*(cpy_ne_kqnb/2)]); + + float2 tmp_f2[cpy_ne_kqnb/2]; +#pragma unroll + for (int k_KQ_2 = 0; k_KQ_2 < cpy_ne_kqnb/2; ++k_KQ_2) { + tmp_f2[k_KQ_2] = __half22float2(tmp_h2[k_KQ_2]); + } + ggml_cuda_memcpy_1( + &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1 + threadIdx.x*cpy_ne_kqnb], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); @@ -298,12 +353,12 @@ static __global__ void flash_attn_tile( #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) { half2 K_k[kq_stride/warp_size][cpy_ne]; - half2 Q_k[ncols/nwarps][cpy_ne]; + half2 Q_k[cpw][cpy_ne]; #else #pragma unroll for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) { float K_k[kq_stride/warp_size][cpy_ne]; - float Q_k[ncols/nwarps][cpy_ne]; + float Q_k[cpw][cpy_ne]; #endif // FAST_FP16_AVAILABLE #pragma unroll @@ -311,29 +366,29 @@ static __global__ void flash_attn_tile( const int i_KQ = i_KQ_0 + threadIdx.x; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]); #else - ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); + ggml_cuda_memcpy_1(&K_k[i_KQ_0/warp_size], &KV_tmp[i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; #ifdef FAST_FP16_AVAILABLE - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]); #else - ggml_cuda_memcpy_1(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); + ggml_cuda_memcpy_1(&Q_k[j_KQ_0], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]); #endif // FAST_FP16_AVAILABLE } #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { #pragma unroll for (int k = 0; k < cpy_ne; ++k) { - ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]); + ggml_cuda_mad(KQ_acc[i_KQ_0/warp_size][j_KQ_0], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0][k]); } } } @@ -344,104 +399,77 @@ static __global__ void flash_attn_tile( } } + // Apply logit softcap, mask, update KQ_max: #pragma unroll for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { const int i_KQ = i_KQ_0 + threadIdx.x; #pragma unroll - for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { - const int j_KQ = j_KQ_0 + threadIdx.y; + for (int j_KQ_0 = 0; j_KQ_0 < cpw; ++j_KQ_0) { + const int j_KQ = j_KQ_0 + threadIdx.y*cpw; if (use_logit_softcap) { - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); + KQ_acc[i_KQ_0/warp_size][j_KQ_0] = logit_softcap * tanhf(KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; + KQ_acc[i_KQ_0/warp_size][j_KQ_0] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; - kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]); - - KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps]; + KQ_max_new[j_KQ_0] = fmaxf(KQ_max_new[j_KQ_0], KQ_acc[i_KQ_0/warp_size][j_KQ_0]); } } __syncthreads(); + // Calculate KQ softmax, write to shared KQ buffer, re-scale VKQ accumulators: #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]); - kqmax[j0/nwarps] = kqmax_new[j0/nwarps]; - - float kqsum_add = 0.0f; - if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) { - const int i = i0 + 4*threadIdx.x; - - float4 val = *(const float4 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - val.z = expf(val.z - kqmax[j0/nwarps]); - val.w = expf(val.w - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y + val.z + val.w; - + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { #ifdef FAST_FP16_AVAILABLE - const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)}; - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); + half tmp[kq_stride/warp_size][softmax_iter_j]; #else - ggml_cuda_memcpy_1(&KQ[j][i], &val); + float tmp[kq_stride/warp_size][softmax_iter_j]; #endif // FAST_FP16_AVAILABLE - } - } else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) { -#pragma unroll - for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) { - const int i = i0 + 2*threadIdx.x; - float2 val = *(const float2 *) &KQ[j][i]; - val.x = expf(val.x - kqmax[j0/nwarps]); - val.y = expf(val.y - kqmax[j0/nwarps]); - kqsum_add += val.x + val.y; -#ifdef FAST_FP16_AVAILABLE - const half2 tmp = make_half2(val.x, val.y); - ggml_cuda_memcpy_1(&KQ[j][i/2], &tmp); -#else - ggml_cuda_memcpy_1(&KQ[j][i], &val); -#endif // FAST_FP16_AVAILABLE - } - } else { +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_max_new[j0+j1] = warp_reduce_max(KQ_max_new[j0+j1]); + const float KQ_max_scale = expf(KQ_max[j0+j1] - KQ_max_new[j0+j1]); + KQ_max[j0+j1] = KQ_max_new[j0+j1]; + + float KQ_sum_add = 0.0f; +#pragma unroll for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const float diff = KQ[j][i] - kqmax[j0/nwarps]; - const float val = expf(diff); - kqsum_add += val; -#ifdef FAST_FP16_AVAILABLE - ((half *) KQ[j])[i] = val; -#else - KQ[j][i] = val; -#endif // FAST_FP16_AVAILABLE + const float val = expf(KQ_acc[i0/warp_size][j0+j1] - KQ_max[j0+j1]); + KQ_sum_add += val; + tmp[i0/warp_size][j1] = val; } - } - kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add; + KQ_sum[j0+j1] = KQ_sum[j0+j1]*KQ_max_scale + KQ_sum_add; #ifdef FAST_FP16_AVAILABLE - const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size] *= KQ_max_scale_h2; + } #else #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; - } + for (int i0 = 0; i0 < D/2; i0 += warp_size) { + VKQ[j0+j1][i0/warp_size].x *= KQ_max_scale; + VKQ[j0+j1][i0/warp_size].y *= KQ_max_scale; + } #endif // FAST_FP16_AVAILABLE + } + +#pragma unroll + for (int i0 = 0; i0 < kq_stride; i0 += warp_size) { + const int i = i0 + threadIdx.x; + + ggml_cuda_memcpy_1( + KQ[j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j)][i], tmp[i0/warp_size]); + } } - constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; + // VKQ = V @ KQ matrix multiplication: + constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D; // Number of V columns that fit in SRAM for K. static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter"); #pragma unroll for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) { @@ -449,65 +477,96 @@ static __global__ void flash_attn_tile( for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) { const int k_tile = k1 + threadIdx.y; -#pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - - const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i]; #ifdef FAST_FP16_AVAILABLE - KV_tmp_h2[k_tile*(D/2) + i] = tmp; -#else - KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp); -#endif // FAST_FP16_AVAILABLE + constexpr int cpy_ne_D = cpy_ne < D/(2*warp_size) ? cpy_ne : D/(2*warp_size); +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*(D/2) + i0 + threadIdx.x*cpy_ne_D], + &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0 + threadIdx.x*cpy_ne_D]); } +#else + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + half2 tmp_h2[cpy_ne_D/2]; + ggml_cuda_memcpy_1( + tmp_h2, &V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i0/2 + threadIdx.x*(cpy_ne_D/2)]); + + float2 tmp_f2[cpy_ne_D/2]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D/2; ++i1) { + tmp_f2[i1] = __half22float2(tmp_h2[i1]); + } + ggml_cuda_memcpy_1( + &KV_tmp[k_tile*D + i0 + threadIdx.x*cpy_ne_D], tmp_f2); + } +#endif // FAST_FP16_AVAILABLE } __syncthreads(); +#ifdef FAST_FP16_AVAILABLE #pragma unroll for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { -#ifdef FAST_FP16_AVAILABLE half2 V_k[(D/2)/warp_size]; - half2 KQ_k[ncols/nwarps]; -#else - float2 V_k[(D/2)/warp_size]; - float KQ_k[ncols/nwarps]; -#endif // FAST_FP16_AVAILABLE + half2 KQ_k[cpw]; + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size; #pragma unroll - for (int i0 = 0; i0 < D/2; i0 += warp_size) { - const int i = i0 + threadIdx.x; - -#ifdef FAST_FP16_AVAILABLE - V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i]; -#else - V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i]; -#endif // FAST_FP16_AVAILABLE + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/warp_size], &KV_tmp[k1*(D/2) + i0 + threadIdx.x*cpy_ne_D]); } #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - const int j = j0 + threadIdx.y; + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); -#ifdef FAST_FP16_AVAILABLE - KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]); -#else - KQ_k[j0/nwarps] = KQ[j][k0 + k1]; -#endif // FAST_FP16_AVAILABLE + half tmp[softmax_iter_j]; + ggml_cuda_memcpy_1( + &tmp, KQ[j][k0 + k1]); +#pragma unroll + for (int j1 = 0; j1 < softmax_iter_j; ++j1) { + KQ_k[j0+j1] = __half2half2(tmp[j1]); + } } #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { -#ifdef FAST_FP16_AVAILABLE - VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps]; -#else - VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps]; - VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps]; -#endif // FAST_FP16_AVAILABLE + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size] += V_k[i0/warp_size]*KQ_k[j0]; } } } +#else +#pragma unroll + for (int k1 = 0; k1 < V_cols_per_iter; ++k1) { + float2 V_k[(D/2)/warp_size]; + float KQ_k[cpw]; + + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1(&V_k[i0/(2*warp_size)], &KV_tmp[k1*D + i0 + threadIdx.x*cpy_ne_D]); + } +#pragma unroll + for (int j0 = 0; j0 < cpw; j0 += softmax_iter_j) { + const int j = j0/softmax_iter_j + threadIdx.y*(cpw/softmax_iter_j); + + ggml_cuda_memcpy_1( + &KQ_k[j0], KQ[j][k0 + k1]); + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size) { +#pragma unroll + for (int j0 = 0; j0 < cpw; ++j0) { + VKQ[j0][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0]; + VKQ[j0][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0]; + } + } + } +#endif // FAST_FP16_AVAILABLE __syncthreads(); } @@ -519,69 +578,92 @@ static __global__ void flash_attn_tile( const float sink = sinksf[head]; #pragma unroll - for (int j0 = 0; j0 < ncols; j0 += nwarps) { - float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink); - kqmax_new_j = warp_reduce_max(kqmax_new_j); + for (int j0 = 0; j0 < cpw; ++j0) { + float KQ_max_new_j = fmaxf(KQ_max[j0], sink); + KQ_max_new_j = warp_reduce_max(KQ_max_new_j); - const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j); - kqmax[j0/nwarps] = kqmax_new_j; + const float KQ_max_scale = expf(KQ_max[j0] - KQ_max_new_j); + KQ_max[j0] = KQ_max_new_j; - const float val = expf(sink - kqmax[j0/nwarps]); - kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale; + const float val = expf(sink - KQ_max[j0]); + KQ_sum[j0] = KQ_sum[j0] * KQ_max_scale; if (threadIdx.x == 0) { - kqsum[j0/nwarps] += val; + KQ_sum[j0] += val; } #ifdef FAST_FP16_AVAILABLE const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale); #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2; + VKQ[j0][i0/warp_size] *= KQ_max_scale_h2; } #else #pragma unroll for (int i0 = 0; i0 < D/2; i0 += warp_size) { - VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale; - VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale; + VKQ[j0][i0/warp_size].x *= KQ_max_scale; + VKQ[j0][i0/warp_size].y *= KQ_max_scale; } #endif // FAST_FP16_AVAILABLE } } - float2 * dst2 = (float2 *) dst; - #pragma unroll - for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { - const int j_VKQ = j_VKQ_0 + threadIdx.y; + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + KQ_sum[j_VKQ_0] = warp_reduce_sum(KQ_sum[j_VKQ_0]); + } + if (gridDim.y == 1) { +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { +#ifdef FAST_FP16_AVAILABLE + const half2 KQ_sum_j_inv = make_half2(1.0f/KQ_sum[j_VKQ_0], 1.0f/KQ_sum[j_VKQ_0]); +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i] *= KQ_sum_j_inv; + } +#else + const float KQ_sum_j_inv = 1.0f/KQ_sum[j_VKQ_0]; +#pragma unroll + for (int i = 0; i < (D/2)/warp_size; ++i) { + VKQ[j_VKQ_0][i].x *= KQ_sum_j_inv; + VKQ[j_VKQ_0][i].y *= KQ_sum_j_inv; + } +#endif // FAST_FP16_AVAILABLE + } + } + + // Write back results: +#pragma unroll + for (int j_VKQ_0 = 0; j_VKQ_0 < cpw; ++j_VKQ_0) { + const int j_VKQ = j_VKQ_0 + threadIdx.y*cpw; if (ic0 + j_VKQ >= ne01) { return; } - float kqsum_j = kqsum[j_VKQ_0/nwarps]; - kqsum_j = warp_reduce_sum(kqsum_j); - const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y; -#pragma unroll - for (int i00 = 0; i00 < D/2; i00 += warp_size) { - const int i0 = i00 + threadIdx.x; - #ifdef FAST_FP16_AVAILABLE - float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]); + constexpr int cpy_ne_D = cpy_ne/2 < (D/2)/warp_size ? cpy_ne/2 : (D/2)/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += warp_size*cpy_ne_D) { + float2 tmp[cpy_ne_D]; +#pragma unroll + for (int i1 = 0; i1 < cpy_ne_D; ++i1) { + tmp[i1] = __half22float2(VKQ[j_VKQ_0][i0/warp_size + i1]); + } + ggml_cuda_memcpy_1(&dst[j_dst_unrolled*D + 2*i0 + threadIdx.x*(2*cpy_ne_D)], tmp); + } #else - float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size]; + constexpr int cpy_ne_D = cpy_ne < D/warp_size ? cpy_ne : D/warp_size; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += warp_size*cpy_ne_D) { + ggml_cuda_memcpy_1( + &dst[j_dst_unrolled*D + i0 + threadIdx.x*cpy_ne_D], &VKQ[j_VKQ_0][i0/(2*warp_size)]); + } #endif // FAST_FP16_AVAILABLE - if (gridDim.y == 1) { - dst_val.x /= kqsum_j; - dst_val.y /= kqsum_j; - } - dst2[j_dst_unrolled*(D/2) + i0] = dst_val; - } - if (gridDim.y != 1 && threadIdx.x == 0) { - dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j); + dst_meta[j_dst_unrolled] = make_float2(KQ_max[j_VKQ_0], KQ_sum[j_VKQ_0]); } } #else @@ -602,15 +684,29 @@ template static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int warp_size = 32; - const int nwarps = FATTN_TILE_NTHREADS / warp_size; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const int warp_size = 32; constexpr size_t nbytes_shared = 0; +#ifdef GGML_USE_HIP + if constexpr (D <= 128) { + if (Q->ne[1] > 32) { + constexpr int cols_per_block = 64; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; + fattn_kernel_t fattn_kernel = flash_attn_tile; + const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); + launch_fattn + (ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size); + return; + } + } +#endif // GGML_USE_HIP + if (Q->ne[1] > 16) { constexpr int cols_per_block = 32; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn @@ -619,6 +715,7 @@ static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml } constexpr int cols_per_block = 16; + const int nwarps = fattn_tile_get_nthreads_host(cc, cols_per_block) / warp_size; fattn_kernel_t fattn_kernel = flash_attn_tile; const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size); launch_fattn diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 7737d6a5d5..56dc054574 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -122,11 +122,14 @@ static __global__ void im2col_3d_kernel( int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW, int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW, int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) { const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= IC_KD_KH_KW) { return; } + GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH); + GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW); const int64_t iic = i / KD_KH_KW; const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW; @@ -148,7 +151,7 @@ static __global__ void im2col_3d_kernel( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) { dst[offset_dst] = 0.0f; } else { - const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw; + const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x); dst[offset_dst] = src[offset_src]; } } @@ -159,6 +162,7 @@ template static void im2col_3d_cuda(const float * src, T* dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { const int64_t OH_OW = OH*OW; const int64_t KD_KH_KW = KD*KH*KW; @@ -179,23 +183,30 @@ static void im2col_3d_cuda(const float * src, T* dst, OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW, IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW, OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH, + stride_q, stride_z, stride_y, stride_x, s0, s1, s2, p0, p1, p2, d0, d1, d2); } static void im2col_3d_cuda_f16(const float * src, half * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } static void im2col_3d_cuda_f32(const float * src, float * dst, int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC, int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW, + int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x, int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) { - im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -235,9 +246,19 @@ void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int64_t OH = ne2; const int64_t OW = ne1; + const size_t es = ggml_element_size(src1); + const int64_t stride_x = src1->nb[0] / es; + const int64_t stride_y = src1->nb[1] / es; + const int64_t stride_z = src1->nb[2] / es; + const int64_t stride_q = src1->nb[3] / es; + if(dst->type == GGML_TYPE_F16) { - im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } else { - im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); + im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, + stride_q, stride_z, stride_y, stride_x, + s0, s1, s2, p0, p1, p2, d0, d1, d2, stream); } } diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu index 153ddbcda9..b91a26fc80 100644 --- a/ggml/src/ggml-cuda/tsembd.cu +++ b/ggml/src/ggml-cuda/tsembd.cu @@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d int j = threadIdx.x + blockIdx.x * blockDim.x; float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 12bbee4556..37386afcd4 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -158,41 +158,41 @@ #define __CUDA_ARCH__ 1300 -#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) -#define GCN -#endif - #if defined(__gfx900__) || defined(__gfx906__) #define GCN5 -#endif +#endif // defined(__gfx900__) || defined(__gfx906__) #if defined(__gfx803__) #define GCN4 -#endif +#endif // defined(__gfx803__) -#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) -#define CDNA // For the entire family -#endif +#if defined(GCN5) || defined(GCN4) +#define GCN +#endif // defined(GCN5) || defined(GCN4) #if defined(__gfx942__) #define CDNA3 -#endif +#endif // defined(__gfx942__) #if defined(__gfx90a__) #define CDNA2 -#endif +#endif // defined(__gfx90a__) #if defined(__gfx908__) #define CDNA1 -#endif +#endif // defined(__gfx908__) + +#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1) +#define CDNA // For the entire family +#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1) #if defined(__GFX12__) #define RDNA4 -#endif +#endif // defined(__GFX12__) #if defined(__GFX11__) #define RDNA3 -#endif +#endif // defined(__GFX11__) #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) @@ -201,7 +201,11 @@ #if defined(__gfx1010__) || defined(__gfx1012__) #define RDNA1 -#endif +#endif // defined(__gfx1010__) || defined(__gfx1012__) + +#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) +#define RDNA // For the entire family +#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1) #ifndef __has_builtin #define __has_builtin(x) 0 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4314c9cc93..5057e264f6 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4167,7 +4167,7 @@ kernel void kernel_timestep_embedding_f32( } if (args.dim % 2 != 0 && tpitg.x == 0) { - embed_data[args.dim] = 0.f; + embed_data[2 * half_] = 0.f; } } diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl index 4b1107f70b..21444bd958 100644 --- a/ggml/src/ggml-opencl/kernels/tsembd.cl +++ b/ggml/src/ggml-opencl/kernels/tsembd.cl @@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding( local_half_dim = logical_dim / 2; local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); - if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { - local_embed_data_ptr[logical_dim] = 0.0f; + if (logical_dim % 2 != 0 && local_j == local_half_dim) { + local_embed_data_ptr[2 * local_half_dim] = 0.0f; } if (local_j >= local_half_dim) { diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index f6ca626ea7..f2003794d3 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -21,11 +21,12 @@ static void timestep_embedding_f32( int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); float * embed_data = (float *)((char *)dst + i*nb1); - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; + int half = dim / 2; + + if (dim % 2 != 0 && j == half) { + embed_data[2 * half] = 0.f; } - int half = dim / 2; if (j >= half) { return; } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 60a99dc78b..1f1136382e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4423,8 +4423,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { static bool ggml_vk_instance_validation_ext_available(); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); - static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); +static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev); static void ggml_vk_instance_init() { if (vk_instance_initialized) { @@ -4540,7 +4540,7 @@ static void ggml_vk_instance_init() { new_driver.pNext = &new_id; devices[i].getProperties2(&new_props); - if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) { + if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) { // Check if there are two physical devices corresponding to the same GPU auto old_device = std::find_if( vk_instance.device_indices.begin(), @@ -12738,6 +12738,20 @@ static bool ggml_vk_instance_debug_utils_ext_available( UNUSED(instance_extensions); } +static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + vkGetPhysicalDeviceFeatures2(vkdev, &device_features2); + + return vk11_features.storageBuffer16BitAccess; +} + static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp index 79e065a931..ce8e09442d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp @@ -24,11 +24,12 @@ void main() { const uint j = gl_GlobalInvocationID.x; const uint d_offset = i * p.nb1; - if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { - data_d[d_offset + p.dim] = 0.f; + const uint half_dim = p.dim / 2; + + if (p.dim % 2 != 0 && j == half_dim) { + data_d[d_offset + 2 * half_dim] = 0.f; } - const uint half_dim = p.dim / 2; if (j >= half_dim) { return; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 50dc1aa24f..3584827dca 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4923,12 +4923,8 @@ struct ggml_tensor * ggml_timestep_embedding( struct ggml_tensor * timesteps, int dim, int max_period) { - int actual_dim = dim; - if (dim % 2 != 0) { - actual_dim = dim + 1; - } - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]); ggml_set_op_params_i32(result, 0, dim); ggml_set_op_params_i32(result, 1, max_period); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7c02f611ed..0c520233d2 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -401,6 +401,7 @@ class MODEL_ARCH(IntEnum): DREAM = auto() SMALLTHINKER = auto() LLADA = auto() + LLADA_MOE = auto() SEED_OSS = auto() @@ -738,6 +739,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", + MODEL_ARCH.LLADA_MOE: "llada-moe", MODEL_ARCH.SEED_OSS: "seed_oss", } @@ -2710,6 +2712,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLADA_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], # TODO } diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 3e02a1be62..6c4b689f02 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -97,6 +97,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, + { LLM_ARCH_LLADA_MOE, "llada-moe" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2167,6 +2168,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_SEED_OSS, { @@ -2447,6 +2468,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index fe80ff5517..c3421181f7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -101,6 +101,7 @@ enum llm_arch { LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, + LLM_ARCH_LLADA_MOE, LLM_ARCH_SEED_OSS, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 85458d3136..c6ad8d2ac5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -957,6 +957,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA_MOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention + hparams.causal_attn = false; + switch (hparams.n_layer) { + case 16: type = LLM_TYPE_A1_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -1359,6 +1371,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (found_swa && hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.set_swa_pattern(4); + } else { + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + } + switch (hparams.n_layer) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; @@ -2408,6 +2428,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_LLADA_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe"); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + } + } break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -12361,6 +12415,7 @@ struct llm_build_olmo : public llm_graph_context { } }; +template struct llm_build_olmo2 : public llm_graph_context { llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -12376,7 +12431,14 @@ struct llm_build_olmo2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_iswa(); + } else { + inp_attn = build_attn_inp_kv(); + } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12409,17 +12471,36 @@ struct llm_build_olmo2 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( + const bool is_swa = hparams.is_swa(il); + + if (is_swa) { + // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling. + // This is achieved here by setting freq_scale and attn_factor to 1. + // We also set ext_factor to 0 to avoid a few unnecessary computations. + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, 1.0, + 0.0, 1.0, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_ext( + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -12618,6 +12699,132 @@ struct llm_build_olmoe : public llm_graph_context { } }; +struct llm_build_llada_moe : public llm_graph_context { + llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_openelm : public llm_graph_context { llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -18810,6 +19017,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA] case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA_MOE: { res = nullptr; } break; @@ -19019,6 +19227,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_LLADA_MOE: + { + llm = std::make_unique(*this, params); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique(*this, params); @@ -19131,7 +19344,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_OLMO2: { - llm = std::make_unique(*this, params); + if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) { + llm = std::make_unique>(*this, params); + } else { + llm = std::make_unique>(*this, params); + } } break; case LLM_ARCH_OLMOE: { @@ -19486,6 +19703,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3MOE: + case LLM_ARCH_LLADA_MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c93e8065a8..97228b2a69 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // attention layers have a non-zero number of kv heads int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; + // now n_attn_layer is the number of attention layers in the encoder + // for each decoder block, there are 2 attention layers + n_attn_layer += 2 * model.hparams.dec_n_layer; } GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected"); } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d7d89d99de..e1e0d66fbb 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1963,7 +1963,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; clean_spaces = false; } else if ( - tokenizer_pre == "bailingmoe") { + tokenizer_pre == "bailingmoe" || + tokenizer_pre == "llada-moe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; } else if ( diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 68ad707f32..4a46b57a52 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/cvector-generator/CMakeLists.txt b/tools/cvector-generator/CMakeLists.txt index 49ad9561c8..baeb4d00c1 100644 --- a/tools/cvector-generator/CMakeLists.txt +++ b/tools/cvector-generator/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/export-lora/CMakeLists.txt b/tools/export-lora/CMakeLists.txt index 310455787a..cddfa77f02 100644 --- a/tools/export-lora/CMakeLists.txt +++ b/tools/export-lora/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/gguf-split/CMakeLists.txt b/tools/gguf-split/CMakeLists.txt index c407e2f0af..9b2125087c 100644 --- a/tools/gguf-split/CMakeLists.txt +++ b/tools/gguf-split/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt index 412696c47c..22f2fe5fdb 100644 --- a/tools/imatrix/CMakeLists.txt +++ b/tools/imatrix/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 17e3b9b87b..b8543a9692 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 95f662a297..ad47bf144f 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -250,6 +250,7 @@ struct cmd_params { std::vector cpu_strict; std::vector poll; std::vector n_gpu_layers; + std::vector n_cpu_moe; std::vector rpc_servers; std::vector split_mode; std::vector main_gpu; @@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = { /* cpu_strict */ { false }, /* poll */ { 50 }, /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, /* rpc_servers */ { "" }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, @@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); if (llama_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); @@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = parse_int_range(argv[i]); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; @@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } @@ -901,6 +915,7 @@ struct cmd_params_instance { bool cpu_strict; int poll; int n_gpu_layers; + int n_cpu_moe; std::string rpc_servers_str; llama_split_mode split_mode; int main_gpu; @@ -973,20 +988,50 @@ struct cmd_params_instance { mparams.tensor_split = tensor_split.data(); mparams.use_mmap = use_mmap; - if (tensor_buft_overrides.empty()) { - mparams.tensor_buft_overrides = nullptr; + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + } else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } } else { - GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); - mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); } return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && - split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && - tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && + main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } llama_context_params to_llama_cparams() const { @@ -1014,6 +1059,7 @@ static std::vector get_cmd_params_instances(const cmd_param // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) for (const auto & rpc : params.rpc_servers) for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) @@ -1051,6 +1097,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1083,6 +1130,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1115,6 +1163,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1152,6 +1201,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + int n_cpu_moe; llama_split_mode split_mode; int main_gpu; bool no_kv_offload; @@ -1186,6 +1236,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; @@ -1236,12 +1287,14 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", + "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", + "stddev_ts" }; return fields; } @@ -1251,8 +1304,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1320,6 +1373,7 @@ struct test { ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), @@ -1568,6 +1622,9 @@ struct markdown_printer : public printer { if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); } diff --git a/tools/main/CMakeLists.txt b/tools/main/CMakeLists.txt index af3d9150f8..8f8e9d444c 100644 --- a/tools/main/CMakeLists.txt +++ b/tools/main/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 0979488560..2381012a0d 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -55,7 +55,7 @@ add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable (${TARGET} mtmd-cli.cpp) set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS") +if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) endif() target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index 3e68640933..12b28b2be4 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 47e5cbe30c..bd9ddbd67d 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,6 +1,9 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/run/CMakeLists.txt b/tools/run/CMakeLists.txt index d018959698..e52294ccc0 100644 --- a/tools/run/CMakeLists.txt +++ b/tools/run/CMakeLists.txt @@ -10,6 +10,8 @@ if (LLAMA_CURL) set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES}) endif () -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tokenize/CMakeLists.txt b/tools/tokenize/CMakeLists.txt index 1690b53e5d..feed9a1062 100644 --- a/tools/tokenize/CMakeLists.txt +++ b/tools/tokenize/CMakeLists.txt @@ -1,5 +1,7 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) -install(TARGETS ${TARGET} RUNTIME) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index c72bd814c3..76320d4c2d 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -1,5 +1,8 @@ set(TARGET llama-tts) add_executable(${TARGET} tts.cpp) -install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif()