diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 43553ac13b..1f226d6ea8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1063,7 +1063,17 @@ jobs: run: | git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - name: Install + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-6.1-${{ runner.os }}-v1 + restore-keys: | + rocm-6.1-${{ runner.os }}- + + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' id: depends run: | $ErrorActionPreference = "Stop" @@ -1071,13 +1081,28 @@ jobs: Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru - $proc.WaitForExit(600000) + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } write-host "Completed AMD HIP SDK installation" - name: Verify ROCm id: verify run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version - name: Install ccache uses: ggml-org/ccache-action@v1.2.16 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5367637e42..701811eeb2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -544,13 +544,23 @@ jobs: run: | git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-6.1-${{ runner.os }}-v1 + restore-keys: | + rocm-6.1-${{ runner.os }}- + - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: key: windows-latest-cmake-hip-${{ matrix.name }}-x64 evict-old-files: 1d - - name: Install + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' id: depends run: | $ErrorActionPreference = "Stop" @@ -558,13 +568,28 @@ jobs: Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru - $proc.WaitForExit(600000) + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. 
Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } write-host "Completed AMD HIP SDK installation" - name: Verify ROCm id: verify run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version - name: Build id: cmake_build diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e68ff92445..6db74a0cf2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,9 @@ - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` - Optionally pick a `` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules - Consider adding yourself to [CODEOWNERS](CODEOWNERS) +- Let authors, who are also collaborators, merge their own PRs +- When merging a PR by a contributor, make sure you have a good understanding of the changes +- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) # Coding guidelines diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 637891f506..182c787544 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -843,9 +843,10 @@ public: _build_object_rule( properties, required, name, schema.contains("additionalProperties") ? schema["additionalProperties"] : json())); - } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) { + } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) { std::unordered_set required; std::vector> properties; + std::map enum_values; std::string hybrid_name = name; std::function add_component = [&](const json & comp_schema, bool is_required) { if (comp_schema.contains("$ref")) { @@ -857,6 +858,14 @@ public: required.insert(prop.key()); } } + } else if (comp_schema.contains("enum")) { + for (const auto & v : comp_schema["enum"]) { + const auto rule = _generate_constant_rule(v); + if (enum_values.find(rule) == enum_values.end()) { + enum_values[rule] = 0; + } + enum_values[rule] += 1; + } } else { // todo warning } @@ -870,6 +879,17 @@ public: add_component(t, true); } } + if (!enum_values.empty()) { + std::vector enum_intersection; + for (const auto & p : enum_values) { + if (p.second == schema["allOf"].size()) { + enum_intersection.push_back(p.first); + } + } + if (!enum_intersection.empty()) { + return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space"); + } + } return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { json items = schema.contains("items") ? 
schema["items"] : schema["prefixItems"]; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 62a546ee22..bbc21813f8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6701,6 +6701,8 @@ class T5Model(TextModel): self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) self.gguf_writer.add_block_count(self.hparams["num_layers"]) + if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: + self.gguf_writer.add_decoder_block_count(dec_n_layer) self.gguf_writer.add_head_count(self.hparams["num_heads"]) self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 357253f43a..35b189bb95 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -314,3 +314,7 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable ### GGML_CANN_ACL_GRAPH Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default. + +### GGML_CANN_GRAPH_CACHE_CAPACITY + +Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted. diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index d4ef751fbb..cefa39a57c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -28,6 +28,15 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; float v; @@ -43,6 +52,8 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * v = (float) *(int16_t *) &data[i]; } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; + } else if (type == GGML_TYPE_BF16) { + v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]); } else { GGML_ABORT("fatal error"); } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index ed37958554..2d57549046 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -586,9 +586,10 @@ class SchemaConverter: properties = list(schema.get('properties', {}).items()) return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) - elif schema_type in (None, 'object') and 'allOf' in schema: + elif schema_type in (None, 'object', 'string') and 'allOf' in schema: required = set() properties = [] + enum_sets = [] hybrid_name = name def add_component(comp_schema, is_required): if (ref := comp_schema.get('$ref')) is not None: @@ -600,6 +601,9 @@ class SchemaConverter: if is_required: required.add(prop_name) + if 'enum' in comp_schema: + enum_sets.append(set(comp_schema['enum'])) + for t in schema['allOf']: if 'anyOf' in t: for tt in t['anyOf']: @@ -607,6 +611,15 @@ class SchemaConverter: else: add_component(t, is_required=True) + if enum_sets: + enum_intersection = enum_sets[0] + for s in enum_sets[1:]: + enum_intersection &= s + + if enum_intersection: + rule = '(' + ' | 
'.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + return self._add_rule(rule_name, rule) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): diff --git a/examples/model-conversion/requirements.txt b/examples/model-conversion/requirements.txt index b8148b269a..ac9f69e10b 100644 --- a/examples/model-conversion/requirements.txt +++ b/examples/model-conversion/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.6.0 -torchvision~=0.21.0 -transformers~=4.55.0 -huggingface-hub~=0.34.0 +torch +torchvision +transformers +huggingface-hub +accelerate diff --git a/examples/model-conversion/scripts/causal/run-org-model.py b/examples/model-conversion/scripts/causal/run-org-model.py index f6188ea6f3..78a54abf13 100755 --- a/examples/model-conversion/scripts/causal/run-org-model.py +++ b/examples/model-conversion/scripts/causal/run-org-model.py @@ -9,15 +9,134 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig import torch import numpy as np -unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') +### If you want to dump RoPE activations, apply this monkey patch to the model +### class from Transformers that you are running (replace apertus.modeling_apertus +### with the proper package and class for your model +### === START ROPE DEBUG === +# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb -parser = argparse.ArgumentParser(description='Process model with specified path') -parser.add_argument('--model-path', '-m', help='Path to the model') +# orig_rope = apply_rotary_pos_emb +# torch.set_printoptions(threshold=float('inf')) +# torch.set_printoptions(precision=6, sci_mode=False) + +# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): +# # log inputs +# summarize(q, "RoPE.q_in") +# summarize(k, "RoPE.k_in") + +# # call original +# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim) + +# # log outputs +# summarize(q_out, "RoPE.q_out") +# summarize(k_out, "RoPE.k_out") + +# return q_out, k_out + +# # Patch it +# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402 +# apertus_mod.apply_rotary_pos_emb = debug_rope +### == END ROPE DEBUG === + + +def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3): + """ + Print a tensor in llama.cpp debug style. + + Supports: + - 2D tensors (seq, hidden) + - 3D tensors (batch, seq, hidden) + - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head + + Shows first and last max_vals of each vector per sequence position. + """ + t = tensor.detach().to(torch.float32).cpu() + + # Determine dimensions + if t.ndim == 3: + _, s, _ = t.shape + elif t.ndim == 2: + _, s = 1, t.shape[0] + t = t.unsqueeze(0) + elif t.ndim == 4: + _, s, _, _ = t.shape + else: + print(f"Skipping tensor due to unsupported dimensions: {t.ndim}") + return + + ten_shape = t.shape + + print(f"ggml_debug: {name} = (f32) ... 
= {{{ten_shape}}}") + print(" [") + print(" [") + + # Determine indices for first and last sequences + first_indices = list(range(min(s, max_seq))) + last_indices = list(range(max(0, s - max_seq), s)) + + # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq + has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s) + + # Combine indices + if has_overlap: + # If there's overlap, just use the combined unique indices + indices = sorted(list(set(first_indices + last_indices))) + separator_index = None + else: + # If no overlap, we'll add a separator between first and last sequences + indices = first_indices + last_indices + separator_index = len(first_indices) + + for i, si in enumerate(indices): + # Add separator if needed + if separator_index is not None and i == separator_index: + print(" ...") + + # Extract appropriate slice + vec = t[0, si] + if vec.ndim == 2: # 4D case: flatten heads × dim_per_head + flat = vec.flatten().tolist() + else: # 2D or 3D case + flat = vec.tolist() + + # First and last slices + first = flat[:max_vals] + last = flat[-max_vals:] if len(flat) >= max_vals else flat + first_str = ", ".join(f"{v:12.4f}" for v in first) + last_str = ", ".join(f"{v:12.4f}" for v in last) + + print(f" [{first_str}, ..., {last_str}]") + + print(" ],") + print(" ]") + print(f" sum = {t.sum().item():.6f}\n") + + +def debug_hook(name): + def fn(_m, input, output): + if isinstance(input, torch.Tensor): + summarize(input, name + "_in") + elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor): + summarize(input[0], name + "_in") + if isinstance(output, torch.Tensor): + summarize(output, name + "_out") + elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor): + summarize(output[0], name + "_out") + + return fn + + +unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME") + +parser = argparse.ArgumentParser(description="Process model with specified path") +parser.add_argument("--model-path", "-m", help="Path to the model") args = parser.parse_args() -model_path = os.environ.get('MODEL_PATH', args.model_path) +model_path = os.environ.get("MODEL_PATH", args.model_path) if model_path is None: - parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + parser.error( + "Model path must be specified either via --model-path argument or MODEL_PATH environment variable" + ) config = AutoConfig.from_pretrained(model_path) @@ -34,18 +153,30 @@ config = AutoConfig.from_pretrained(model_path) if unreleased_model_name: model_name_lower = unreleased_model_name.lower() - unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + unreleased_module_path = ( + f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + ) class_name = f"{unreleased_model_name}ForCausalLM" print(f"Importing unreleased model module: {unreleased_module_path}") try: - model_class = getattr(importlib.import_module(unreleased_module_path), class_name) - model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained + model_class = getattr( + importlib.import_module(unreleased_module_path), class_name + ) + model = model_class.from_pretrained( + model_path + ) # Note: from_pretrained, not fromPretrained except (ImportError, AttributeError) as e: print(f"Failed to import or load model: {e}") exit(1) else: - model = AutoModelForCausalLM.from_pretrained(model_path) + model = 
AutoModelForCausalLM.from_pretrained( + model_path, device_map="auto", offload_folder="offload" + ) + +for name, module in model.named_modules(): + if len(list(module.children())) == 0: # only leaf modules + module.register_forward_hook(debug_hook(name)) model_name = os.path.basename(model_path) # Printing the Model class to allow for easier debugging. This can be useful diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index a610694423..1163438bc2 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_DEPRECATED( - GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); - GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - // helper to check if the device supports a specific family // ideally, the user code should be doing these checks // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index c36c12d657..2db5c4e0f8 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -114,6 +114,9 @@ extern "C" { void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); // wait for an event on on a different stream void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + + // (optional) sort/optimize the nodes in the graph + void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph); }; struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index f615ab4bee..7646f3f134 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) backend->iface.event_wait(backend, event); } +static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + GGML_ASSERT(backend); + if (backend->iface.optimize_graph != NULL) { + backend->iface.optimize_graph(backend, cgraph); + } +} + // Backend device const char * ggml_backend_dev_name(ggml_backend_dev_t device) { @@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); + // Optimize this split of the graph. This needs to happen before we make graph_copy, + // so they are in sync. 
+ ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph); + // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { assert(graph_copy->size > (graph_copy->n_nodes + 1)); diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index aeac2e5744..cdfc5a9bc2 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = { /* .graph_compute = */ ggml_backend_blas_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_blas_guid(void) { diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index ac2e2e1adf..434023dd22 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2268,8 +2268,6 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, * stream, and persistent buffers for rope init/cache. * @param dst The destination ggml_tensor whose computation * depends on the RoPE values (usually Qcur/Kcur). - * @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values. - * @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values. * @param theta_scale Scalar exponent base for computing theta scale values. * @param freq_scale Frequency scaling factor, applied to theta scale. * @param attn_factor Attention scaling factor, applied to sin/cos. @@ -2277,17 +2275,23 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, * (dim expansion vs repeat_interleave). */ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, - void* sin_tensor_buffer, void* cos_tensor_buffer, float* corr_dims, float ext_factor, float theta_scale, float freq_scale, float attn_factor, bool is_neox) { - // int sin/cos cache, cache has different repeat method depond on - // @param.is_neox - ggml_tensor* src0 = dst->src[0]; // input ggml_tensor* src1 = dst->src[1]; // position ggml_tensor* src2 = dst->src[2]; // freq_factors + if(src2 == nullptr && ctx.rope_cache.cached + && ctx.rope_cache.ext_factor == ext_factor + && ctx.rope_cache.theta_scale == theta_scale + && ctx.rope_cache.freq_scale == freq_scale + && ctx.rope_cache.attn_factor == attn_factor + && ctx.rope_cache.is_neox == is_neox) { + // use cache. 
+ return; + } + int64_t theta_scale_length = src0->ne[0] / 2; int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float), @@ -2316,8 +2320,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ctx.rope_cache.freq_scale != freq_scale) { ctx.rope_cache.theta_scale_length = theta_scale_length; - ctx.rope_cache.theta_scale = theta_scale; - ctx.rope_cache.freq_scale = freq_scale; if (ctx.rope_cache.theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache)); @@ -2342,7 +2344,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, // return MIN(1, MAX(0, y)) - 1; yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float)); void* yarn_ramp_buffer = yarn_ramp_allocator.get(); - acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t), + acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); float zero_value = 0, one_value = 1; float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]); @@ -2411,6 +2413,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor); } + // init sin_repeat && cos_repeat, only to accelerate first layer on each device + if (position_length > ctx.rope_cache.position_length) { + ctx.rope_cache.position_length = position_length; + if (ctx.rope_cache.sin_cache != nullptr) { + ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache)); + } + if (ctx.rope_cache.cos_cache != nullptr) { + ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache)); + } + int64_t repeat_theta_length = theta_scale_length * position_length * 2; + ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + } + // position aclTensor* acl_position_tensor = ggml_cann_create_tensor( src1->data, ggml_cann_type_mapping(src1->type), @@ -2462,10 +2478,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } aclTensor* acl_sin_repeat_tensor = - ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_cos_repeat_tensor = - ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); // repeat @@ -2483,6 +2499,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, num_repeats, output_size); } + // Other layers use cache except first layer. 
+ ctx.rope_cache.cached = true; + ctx.rope_cache.ext_factor = ext_factor; + ctx.rope_cache.theta_scale = theta_scale; + ctx.rope_cache.freq_scale = freq_scale; + ctx.rope_cache.attn_factor = attn_factor; + ctx.rope_cache.is_neox = is_neox; + ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor, acl_cos_repeat_tensor); @@ -2504,10 +2528,7 @@ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, #endif void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // TODO: use ascendc - // Only test with LLAMA model. ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src1 = dst->src[1]; // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; @@ -2538,15 +2559,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - // sin/cos tensor length. - int64_t repeat_theta_length = src0->ne[0] * src1->ne[0]; - ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float)); - ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float)); - void *sin_tensor_buffer = sin_tensor_allocator.get(); - void *cos_tensor_buffer = cos_tensor_allocator.get(); - // init ctx.rope_cos/rope_sin cache - aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor, + aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox); int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1}; @@ -2556,10 +2570,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } aclTensor* acl_sin_reshape_tensor = - ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_cos_reshape_tensor = - ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_src = ggml_cann_create_tensor(src0); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index e295f4ab47..c5fce8dc91 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "../include/ggml-cann.h" #include "../include/ggml.h" @@ -106,6 +107,7 @@ int32_t ggml_cann_get_device(); std::optional get_env(const std::string& name); bool parse_bool(const std::string& value); +int parse_integer(const std::string& value); /** * @brief Abstract base class for memory pools used by CANN. @@ -350,7 +352,7 @@ struct ggml_graph_node_properties { struct ggml_cann_graph { ~ggml_cann_graph() { if (graph != nullptr) { - aclmdlRIDestroy(graph); + ACL_CHECK(aclmdlRIDestroy(graph)); } } @@ -358,6 +360,64 @@ struct ggml_cann_graph { std::vector ggml_graph_properties; }; + +/** + * @brief LRU cache for managing ggml_cann_graph objects. + * + * This class maintains a list of shared_ptr to ggml_cann_graph objects + * and enforces a maximum capacity. It provides methods to push new graphs, + * move existing graphs to the front (most recently used), and clear the cache. + */ +struct ggml_cann_graph_lru_cache { + size_t capacity; /**< Maximum number of graphs in the cache. 
*/ + + std::list cache_list; /**< List storing cached graphs as raw pointers. */ + + ggml_cann_graph_lru_cache() { + capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); + } + + /** + * @brief Push a new graph to the front of the cache. + * If the cache exceeds capacity, the least recently used graph is deleted. + * @param new_node Pointer to the new ggml_cann_graph to cache. + * Ownership is transferred to the cache (cache will delete it). + */ + void push(ggml_cann_graph* new_node) { + if (cache_list.size() >= capacity) { + ggml_cann_graph* old = cache_list.back(); + cache_list.pop_back(); + delete old; // free the old graph + } + cache_list.push_front(new_node); + } + + /** + * @brief Move an existing graph to the front of the cache. + * @param node Pointer to the ggml_cann_graph to move. + */ + void move_to_front(ggml_cann_graph* node) { + cache_list.remove(node); + cache_list.push_front(node); + } + + /** + * @brief Clear all graphs from the cache (also frees memory). + */ + void clear() { + for (auto ptr : cache_list) { + delete ptr; + } + cache_list.clear(); + } + + /** + * @brief Destructor that clears the cache and frees all cached graphs. + */ + ~ggml_cann_graph_lru_cache() { + clear(); + } +}; #endif // USE_ACL_GRAPH struct ggml_cann_rope_cache { @@ -365,12 +425,27 @@ struct ggml_cann_rope_cache { if(theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(theta_scale_cache)); } + if(sin_cache != nullptr) { + ACL_CHECK(aclrtFree(sin_cache)); + } + if(cos_cache != nullptr) { + ACL_CHECK(aclrtFree(cos_cache)); + } } void* theta_scale_cache = nullptr; int64_t theta_scale_length = 0; + // sin/cos cache, used only to accelerate first layer on each device + void* sin_cache = nullptr; + void* cos_cache = nullptr; + int64_t position_length = 0; + // Properties to check before reusing the sincos cache + bool cached = false; + float ext_factor = 0.0f; float theta_scale = 0.0f; float freq_scale = 0.0f; + float attn_factor = 0.0f; + bool is_neox = false; }; struct ggml_cann_tensor_cache { @@ -394,7 +469,7 @@ struct ggml_backend_cann_context { aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. - std::unique_ptr cann_graph; + ggml_cann_graph_lru_cache graph_lru_cache; bool acl_graph_mode = true; #endif cann_task_queue task_queue; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 2f9f373f54..d148174f1e 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -116,6 +116,24 @@ bool parse_bool(const std::string& value) { return valid_values.find(value) != valid_values.end(); } +/** + * @brief Parse a string as an integer, returning 0 if invalid. + * + * This function attempts to convert the input string `value` to an `int`. + * If the string is not a valid integer or is out of the `int` range, + * it returns 0. + * + * @param value The string to parse. + * @return The parsed integer, or 0 if conversion fails. + */ +int parse_integer(const std::string& value) { + try { + return std::stoi(value); + } catch (...) { + return 0; + } +} + /** * @brief Initialize the CANN device information. * @@ -2131,30 +2149,52 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) { #ifdef USE_ACL_GRAPH /** - * @brief Populate the internal CANN graph node properties from the ggml computation graph. 
+ * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph. * - * This function copies all node attributes (operation type, dimensions, strides, input sources, - * and operation parameters) into the cached CANN graph structure for later reuse or comparison. + * This function creates a new ggml_cann_graph object and fills its node properties + * (operation type, dimensions, strides, input sources, and operation parameters) + * based on the current ggml computation graph. * - * @param cann_ctx The CANN backend context. - * @param cgraph The ggml computational graph. + * Each node in the ggml graph is mapped to a property entry in the new CANN graph: + * - node address + * - operation type + * - shape (ne) and strides (nb) + * - source tensor addresses + * - operation parameters + * + * After initialization, the new graph is pushed into the LRU cache owned by the + * CANN backend context. The cache takes ownership of the graph and manages its + * lifetime (including deletion upon eviction). + * + * @param cann_ctx The CANN backend context containing the graph cache. + * @param cgraph The current ggml computation graph. */ -static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) { - ggml_tensor * node = cgraph->nodes[node_idx]; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op; +static void add_lru_matched_graph_node_properties( + ggml_backend_cann_context * cann_ctx, + ggml_cgraph * cgraph) { + // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache). + ggml_cann_graph * new_graph = new ggml_cann_graph(); + new_graph->ggml_graph_properties.resize(cgraph->n_nodes); - for (int dim = 0; dim < GGML_MAX_DIMS; dim++) { - cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim]; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim]; + for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) { + ggml_tensor * node = cgraph->nodes[node_idx]; + auto & prop = new_graph->ggml_graph_properties[node_idx]; + + prop.node_address = node->data; + prop.node_op = node->op; + + std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne); + std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb); + + for (int src = 0; src < GGML_MAX_SRC; ++src) { + prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr; } - for (int src = 0; src < GGML_MAX_SRC; src++) { - cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] = - node->src[src] ? node->src[src]->data : nullptr; - } - memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS); + + memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS); } + + // Insert into the LRU cache (cache takes ownership and will delete it when evicted). + cann_ctx->graph_lru_cache.push(new_graph); } /** @@ -2199,30 +2239,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra } /** - * @brief Determine if the CANN graph needs to be rebuilt due to graph changes. + * @brief Check whether there is a cached CANN graph that matches the current ggml graph. * - * This checks whether the number or properties of ggml graph nodes have changed - * compared to the last captured CANN graph. If so, the CANN graph must be re-captured. 
+ * This function iterates through the cached CANN graphs stored in the LRU cache and + * compares them against the given ggml computation graph. A match requires that the + * number of nodes is the same and that each node’s properties (operation type, + * dimensions, strides, inputs, and operation parameters) are identical. * - * @param cann_ctx The CANN backend context. + * If a matching graph is found, it is promoted to the front of the LRU cache and the + * function returns true. Otherwise, the function returns false, indicating that a new + * CANN graph needs to be captured. + * + * @param cann_ctx The CANN backend context containing the graph cache. * @param cgraph The current ggml computation graph. - * @return true if an update is required; false otherwise. + * @return true if a matching cached graph exists; false otherwise. */ -static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - // The number of nodes is different, so the graph needs to be reconstructed. - if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { - cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes); - return true; - } +static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { + ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache; + for (auto &graph_ptr : lru_cache.cache_list) { + // Skip graphs with a different number of nodes. + if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) { + continue; + } - // The number of nodes is the same; iterate over each node to check whether they match. - for (int i = 0; i < cgraph->n_nodes; i++) { - bool has_matching_properties = ggml_graph_node_has_matching_properties( - cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]); - if(!has_matching_properties) { + // Check if all nodes match. + bool all_match = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) { + all_match = false; + break; + } + } + + if (all_match) { + // Move the matched graph to the front of the cache and return. + lru_cache.move_to_front(graph_ptr); return true; } } + return false; } #endif // USE_ACL_GRAPH @@ -2241,17 +2296,13 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, * @param cann_graph_update_required Whether graph capture is needed due to graph changes. */ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph, - bool & use_cann_graph, bool & cann_graph_update_required) { + bool & use_cann_graph, bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH + ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); if (use_cann_graph && cann_graph_update_required) { - if (cann_ctx->cann_graph->graph != nullptr) { - ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph)); - cann_ctx->cann_graph->graph = nullptr; - } ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH - // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // With the use of CANN graphs, the execution will be performed by the graph launch. 
if (!use_cann_graph || cann_graph_update_required) { @@ -2272,12 +2323,12 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx #ifdef USE_ACL_GRAPH if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture - ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph)); + ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); } if (use_cann_graph) { // Execute graph - ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream())); + ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } #endif // USE_ACL_GRAPH } @@ -2302,6 +2353,9 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_cann_set_device(cann_ctx->device); g_nz_workspaces[cann_ctx->device].clear(); + // calculate the rope cache for the first layer on the current device. + cann_ctx->rope_cache.cached = false; + #ifdef USE_ACL_GRAPH bool use_cann_graph = true; bool cann_graph_update_required = false; @@ -2311,19 +2365,17 @@ static enum ggml_status ggml_backend_cann_graph_compute( } if (use_cann_graph) { - if (cann_ctx->cann_graph == nullptr) { - cann_ctx->cann_graph.reset(new ggml_cann_graph()); - cann_graph_update_required = true; + // If no matching graph is found, the graph needs to be recaptured. + cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph); + if (cann_graph_update_required) { + // If no matching graph is found, add a new ACL graph. + add_lru_matched_graph_node_properties(cann_ctx, cgraph); } - - cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph); - set_ggml_graph_node_properties(cann_ctx, cgraph); } #else bool use_cann_graph = false; bool cann_graph_update_required = false; #endif // USE_ACL_GRAPH - evaluate_and_capture_cann_graph( cann_ctx, cgraph, @@ -2690,6 +2742,7 @@ static const ggml_backend_i ggml_backend_cann_interface = { /* .graph_compute = */ ggml_backend_cann_graph_compute, /* .event_record = */ ggml_backend_cann_event_record, /* .event_wait = */ ggml_backend_cann_event_wait, + /* .optimize_graph = */ NULL, }; /** diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 0c15165f1b..2b81f8b9af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = { /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9adf910769..212e52ef6a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8598,6 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32( embed_data[j + half] = sinf(arg); } if (dim % 2 != 0 && ith == 0) { + embed_data[2 * half] = 0.f; embed_data[dim] = 0.f; } } diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index d3dfc7807d..0d8c5af473 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) file(GLOB SRCS "template-instances/mmq*.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/mmf*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) if (GGML_CUDA_FA_ALL_QUANTS) file(GLOB SRCS "template-instances/fattn-vec*.cu") diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 
1c76566344..725e1a81a1 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -23,28 +23,44 @@ static __device__ __forceinline__ float op_div(const float a, const float b) { return a / b; } +template +static __global__ void k_bin_bcast(const src0_t * src0, + const src1_t * src1, + dst_t * dst, + const int ne0, + const int ne1, + const int ne2, + const uint3 ne3, + const uint3 ne10, + const uint3 ne11, + const uint3 ne12, + const uint3 ne13, + /*int s0, */ const int s1, + const int s2, + const int s3, + /*int s00,*/ const int s01, + const int s02, + const int s03, + /*int s10,*/ const int s11, + const int s12, + const int s13, + src1_ptrs... src1s) { + const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x; + const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y); + const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3); + const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z); - -template -static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - const int ne0, const int ne1, const int ne2, const int ne3, - const int ne10, const int ne11, const int ne12, const int ne13, - /*int s0, */ const int s1, const int s2, const int s3, - /*int s00,*/ const int s01, const int s02, const int s03, - /*int s10,*/ const int s11, const int s12, const int s13, - src1_ptrs... src1s) { - const int i0s = blockDim.x*blockIdx.x + threadIdx.x; - const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); - const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; - const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) { return; } - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; + const uint32_t i11 = fastmodulo(i1, ne11); + const uint32_t i12 = fastmodulo(i2, ne12); + const uint32_t i13 = fastmodulo(i3, ne13); const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; @@ -53,8 +69,8 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; - for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { - const int i10 = i0 % ne10; + for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { + const uint32_t i10 = fastmodulo(i0, ne10); float result = src0_row ? (float) src0_row[i0] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { @@ -67,28 +83,48 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst } } -template -static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - const int ne0, const int ne1, const int ne2,const int ne3, - const int ne10, const int ne11, const int ne12, const int ne13, - /*int s0, */ const int s1, const int s2, const int s3, - /*int s00,*/ const int s01, const int s02, const int s03, - /*int s10,*/ const int s11, const int s12, const int s13, - src1_ptrs ... 
src1s) { +template +static __global__ void k_bin_bcast_unravel(const src0_t * src0, + const src1_t * src1, + dst_t * dst, + const uint3 ne0, + const uint3 ne1, + const uint3 ne2, + const uint32_t ne3, + const uint3 prod_012, + const uint3 prod_01, + const uint3 ne10, + const uint3 ne11, + const uint3 ne12, + const uint3 ne13, + /*int s0, */ const int s1, + const int s2, + const int s3, + /*int s00,*/ const int s01, + const int s02, + const int s03, + /*int s10,*/ const int s11, + const int s12, + const int s13, + src1_ptrs... src1s) { const int i = blockDim.x*blockIdx.x + threadIdx.x; - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; + const uint32_t i3 = fastdiv(i, prod_012); + const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01); + const uint32_t i1 = fastdiv(i - i3 * prod_012.z - i2 * prod_01.z, ne0); + const uint32_t i0 = i - i3 * prod_012.z - i2 * prod_01.z - i1 * ne0.z; - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + if (i0 >= ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) { return; } - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; + const int i11 = fastmodulo(i1, ne11); + const int i12 = fastmodulo(i2, ne12); + const int i13 = fastmodulo(i3, ne13); const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; @@ -97,7 +133,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; - const int i10 = i0 % ne10; + const int i10 = fastmodulo(i0, ne10); float result = src0_row ? (float) src0_row[i0] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { @@ -170,11 +206,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02); //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03); - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - size_t nb0 = cnb[0]; size_t nb1 = cnb[1]; size_t nb2 = cnb[2]; @@ -233,48 +264,51 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * block_dims.y = std::min(ne1, block_size / block_dims.x); block_dims.z = std::min(std::min(ne2 * ne3, block_size / block_dims.x / block_dims.y), 64U); - dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, - (ne1 + block_dims.y - 1) / block_dims.y, + dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, (ne1 + block_dims.y - 1) / block_dims.y, (ne2 * ne3 + block_dims.z - 1) / block_dims.z); + const uint3 ne10 = init_fastdiv_values((uint32_t) cne1[0]); + const uint3 ne11 = init_fastdiv_values((uint32_t) cne1[1]); + const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]); + const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]); + if (block_nums.z > 65535) { - int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2)); + const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1)); + const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0); + const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1); + const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2); + if constexpr (sizeof...(I) > 0) { - k_bin_bcast_unravel - <<>>(src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, 
s3, - /* s00,*/ s01, s02, s03, - /* s10,*/ s11, s12,s13, - (const src1_t *) dst->src[I + 1]->data...); + k_bin_bcast_unravel<<>>( + src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, + ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00,*/ s01, s02, s03, + /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } else { k_bin_bcast_unravel - <<>>(src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s00,*/ s01, s02, s03, - /* s10,*/ s11, s12,s13); + <<>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, + ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00,*/ s01, s02, s03, + /* s10,*/ s11, s12, s13); } } else { + const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); if constexpr (sizeof...(I) > 0) { - k_bin_bcast - <<>>(src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s00,*/ s01, s02, s03, - /* s10,*/ s11, s12,s13, - (const src1_t *) dst->src[I + 1]->data...); + k_bin_bcast<<>>( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00,*/ s01, s02, s03, + /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } else { - k_bin_bcast - <<>>(src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s00,*/ s01, s02, s03, - /* s10,*/ s11, s12,s13); + k_bin_bcast<<>>( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00,*/ s01, s02, s03, + /* s10,*/ s11, s12, s13); } } } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 931524a200..394595be0e 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -545,6 +545,31 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #endif // defined(GGML_USE_HIP) } +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) { + acc += v*u; +} + +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) { + acc += v.x*u.x; + acc += v.y*u.y; +} + +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) { +#if defined(GGML_USE_HIP) && defined(GCN) + asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u)); +#else +#ifdef FAST_FP16_AVAILABLE + const float2 tmp = __half22float2(v*u); + acc += tmp.x + tmp.y; +#else + const float2 tmpv = __half22float2(v); + const float2 tmpu = __half22float2(u); + acc += tmpv.x * tmpu.x; + acc += tmpv.y * tmpu.y; +#endif // FAST_FP16_AVAILABLE +#endif // defined(GGML_USE_HIP) && defined(GCN) +} + static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { #if CUDART_VERSION >= 12080 const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x); diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu index fb2163acd0..64f7d4a1a1 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cu +++ b/ggml/src/ggml-cuda/fattn-tile.cu @@ -304,12 +304,7 @@ static __global__ void flash_attn_tile( for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { #pragma unroll for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { -#ifdef FAST_FP16_AVAILABLE - const float2 tmp = __half22float2(K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]); - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += tmp.x + tmp.y; -#else - 
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]; -#endif // FAST_FP16_AVAILABLE + ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size], Q_k[j_KQ_0/nwarps]); } } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 83d02474f5..2fab33243d 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -2,39 +2,39 @@ #include "dequantize.cuh" #include "convert.cuh" -#define MAX_GRIDDIM_Y 65535 - template static __global__ void k_get_rows( const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { - // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. - const int i10 = blockIdx.x; - const int i11 = blockIdx.z / ne12; - const int i12 = blockIdx.z % ne12; + for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i10 = blockIdx.x; + const int i11 = z / ne12; // TODO fastdiv + const int i12 = z % ne12; - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03; + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03; - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 
1 : qk/2; - // dequantize - float2 v; - dequantize_kernel(src0_row, ib, iqs, v); + // dequantize + float2 v; + dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = ggml_cuda_cast(v.x); - dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); + dst_row[iybs + iqs + 0] = ggml_cuda_cast(v.x); + dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); + } } } @@ -42,27 +42,29 @@ template static __global__ void k_get_rows_float( const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { - // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. - const int i10 = blockIdx.x; - const int i11 = blockIdx.z / ne12; - const int i12 = blockIdx.z % ne12; + for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i10 = blockIdx.x; + const int i11 = z / ne12; // TODO fastdiv + const int i12 = z % ne12; - if (i00 >= ne00) { - return; + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } } @@ -98,7 +100,7 @@ static void get_rows_cuda_q( cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12); + const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX)); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); @@ -116,7 +118,7 @@ static void get_rows_cuda_q( k_get_rows<<>>( src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); @@ -131,7 +133,7 @@ static void get_rows_cuda_float( cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12); + const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX)); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); @@ -147,7 +149,7 @@ static void get_rows_cuda_float( k_get_rows_float<<>>( src0_d, 
src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2051a2f50e..154076f38d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2110,6 +2110,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); return; } + + if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) { + ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst); + return; + } } cudaStream_t stream = ctx.stream(); @@ -3139,6 +3144,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_cuda_guid() { @@ -3396,10 +3402,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_GET_ROWS: { - // FIXME: https://github.com/ggml-org/llama.cpp/pull/15868 - if (op->src[1]->ne[1]*op->src[1]->ne[2] > 65535) { - return false; - } switch (op->src[0]->type) { case GGML_TYPE_F16: case GGML_TYPE_F32: diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 667deb9c65..c1f24243fe 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -1,3 +1,4 @@ +#pragma once // This file contains primitives that expose the tensor core PTX instructions for CUDA code. // The primitives can be used in a similar way as the nvcuda::wmma interface but with a well-defined memory layout. 
// The documentation for the PTX instructions can be found under: diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index cfa5c5cce2..16331e9ecf 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -1,343 +1,12 @@ #include "ggml.h" -#include "common.cuh" -#include "mma.cuh" #include "mmf.cuh" -using namespace ggml_cuda_mma; - -#define MMF_ROWS_PER_BLOCK 32 - -template -__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1) -static __global__ void mul_mat_f( - const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, - const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst, - const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - typedef tile<16, 8, T> tile_A; - typedef tile< 8, 8, T> tile_B; - typedef tile<16, 8, float> tile_C; - - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - constexpr int tile_k_padded = warp_size + 4; - constexpr int ntA = rows_per_block / tile_A::I; - constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I; - - const int row0 = blockIdx.x * rows_per_block; - const int channel_dst = blockIdx.y; - const int channel_x = channel_dst / channel_ratio; - const int channel_y = channel_dst; - const int sample_dst = blockIdx.z; - const int sample_x = sample_dst / sample_ratio; - const int sample_y = sample_dst; - - x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ; - y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; - dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; - - const float2 * y2 = (const float2 *) y; - - extern __shared__ char data_mmv[]; - - tile_C C[ntA][ntB]; - - T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded); - - for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) { - tile_A A[ntA][warp_size / tile_A::J]; -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { -#pragma unroll - for (int i = 0; i < tile_A::I; ++i) { - tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col]; - } -#pragma unroll - for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { - load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); - } - } - -#pragma unroll - for (int itB = 0; itB < ntB; ++itB) { - if constexpr (std::is_same_v) { -#pragma unroll - for (int j0 = 0; j0 < tile_B::I; ++j0) { - const int j = j0 + itB*tile_B::I; - - tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f; - } - } else if constexpr (std::is_same_v || std::is_same_v) { -#pragma unroll - for (int j0 = 0; j0 < tile_B::I; ++j0) { - const int j = j0 + itB*tile_B::I; - - const float2 tmp = j < cols_per_block ? 
y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; - } - } else { - static_assert(std::is_same_v, "unsupported type"); - } -#pragma unroll - for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { - tile_B B; - load_ldmatrix(B, tile_xy + k0, tile_k_padded); -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { - mma(C[itA][itB], A[itA][k0/tile_B::J], B); - } - } - } - } - - float * buf_iw = (float *) data_mmv; - constexpr int kiw = nwarps*rows_per_block + 4; - - if (nwarps > 1) { - __syncthreads(); - } -#pragma unroll - for (int itB = 0; itB < ntB; ++itB) { -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { -#pragma unroll - for (int l = 0; l < tile_C::ne; ++l) { - const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l); - const int j = itB*tile_C::J + tile_C::get_j(l); - buf_iw[j*kiw + i] = C[itA][itB].x[l]; - } - } - } - - if (nwarps > 1) { - __syncthreads(); - } - -#pragma unroll - for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - if (j0 + nwarps > cols_per_block && j >= cols_per_block) { - return; - } - - float sum = 0.0f; - static_assert(rows_per_block == warp_size, "need loop/check"); -#pragma unroll - for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) { - const int i = i0 + threadIdx.x; - - sum += buf_iw[j*kiw + i]; - } - dst[j*stride_col_dst + row0 + threadIdx.x] = sum; - } -#else - GGML_UNUSED_VARS(x, y, ids, dst, - ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) -} - -template -static void mul_mat_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols_x, const int64_t nrows_x, - const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, - const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, - const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { - typedef tile<16, 8, T> tile_A; - typedef tile< 8, 8, T> tile_B; - - GGML_ASSERT(!ids && "mul_mat_id not implemented"); - - GGML_ASSERT(ncols_x % 2 == 0); - GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(stride_col_y % 2 == 0); - GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); - GGML_ASSERT( nsamples_dst % nsamples_x == 0); - const int64_t channel_ratio = nchannels_dst / nchannels_x; - const int64_t sample_ratio = nsamples_dst / nsamples_x; - - const int device = ggml_cuda_get_device(); - const int warp_size = ggml_cuda_info().devices[device].warp_size; - - int64_t nwarps_best = 1; - int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2); - int64_t max_block_size = 256; - for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) { - const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2); - if (niter < niter_best) { - niter_best = niter; - nwarps_best = nwarps; - } - } - - constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; - const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4; - const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 
4) * 4; - const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); - const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst); - const dim3 block_dims(warp_size, nwarps_best, 1); - switch (nwarps_best) { - case 1: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 2: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 3: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 4: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 5: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 6: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 7: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 8: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - default: { - GGML_ABORT("fatal error"); - } break; - } -} - -template -static void mul_mat_f_switch_cols_per_block( - const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst, - const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, - const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, - const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { - switch (ncols_dst) { - case 1: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 2: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, 
stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 3: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 4: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 5: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 6: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 7: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 8: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 9: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 10: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 11: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 12: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 13: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 14: { - mul_mat_f_cuda(x, y, 
ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 15: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 16: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - default: { - GGML_ABORT("fatal error"); - } break; - } -} - void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_TENSOR_BINARY_OP_LOCALS; const size_t ts_src0 = ggml_type_size(src0->type); @@ -365,55 +34,72 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0; + const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: const int64_t ncols_dst = ids ? ne2 : ne1; - const int64_t nchannels_y = ids ? ne11 : ne12; - const int64_t nchannels_dst = ids ? ne1 : ne2; - const int64_t stride_channel_dst = ids ? s1 : s2; - const int64_t stride_channel_y = ids ? s11 : s12; + const int64_t nchannels_dst = ids ? ne1 : ne2; - GGML_ASSERT(!ids || ncols_dst == 1); + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + + int64_t stride_channel_y = ids ? s11 : s12; + int64_t nchannels_y = ids ? 
ne11 : ne12; + + //mul_mat_id: handle broadcast + if (ids && nchannels_y == 1) { + stride_channel_y = 0; + nchannels_y = ids->ne[0]; + } switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; constexpr int vals_per_T = 1; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; case GGML_TYPE_F16: { const half2 * src0_d = (const half2 *) src0->data; constexpr int vals_per_T = 2; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data; constexpr int vals_per_T = 2; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); } } -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) { +bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) { + + if (ggml_is_quantized(type)) { + return false; + } + if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) { return false; } if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) { return false; } - if (ne11 > 16) { + if (src1_ncols > 16) { return false; } + switch (type) { case GGML_TYPE_F32: return ampere_mma_available(cc); diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index 785f9f211c..bf724bc57b 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -1,5 +1,473 @@ +#pragma once + +#include "mma.cuh" #include "common.cuh" +using namespace ggml_cuda_mma; + +#define MMF_ROWS_PER_BLOCK 32 + void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11); +bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols); + +template 
+__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1) +static __global__ void mul_mat_f( + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst, + const int stride_col_id, const int stride_row_id, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) + typedef tile<16, 8, T> tile_A; + typedef tile< 8, 8, T> tile_B; + typedef tile<16, 8, float> tile_C; + + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + constexpr int tile_k_padded = warp_size + 4; + constexpr int ntA = rows_per_block / tile_A::I; + constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I; + + const int row0 = blockIdx.x * rows_per_block; + + const int expert_idx = has_ids ? blockIdx.y : 0; + const int channel_dst = has_ids ? 0 : blockIdx.y; + + const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio); + const int channel_y = channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + + x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ; + y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y); + dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst); + + const float2 * y2 = (const float2 *) y; + + extern __shared__ char data_mmv[]; + + char * shmem_base = data_mmv; + int * slot_map = (int *) shmem_base; + char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base; + + tile_C C[ntA][ntB]; + + T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded); + + if constexpr (has_ids) { + __shared__ int has_any; + if (threadIdx.y == 0) { + int local_has_any = 0; + for (int j = threadIdx.x; j < cols_per_block; j += warp_size) { + int slot = -1; + for (int k = 0; k < nchannels_dst; ++k) { + const int idv = ids[j*stride_row_id + k*stride_col_id]; + if (idv == expert_idx) { + slot = k; + break; + } + } + if (j < cols_per_block) { + local_has_any |= (slot >= 0); + slot_map[j] = slot; + } + } + has_any = warp_reduce_any(local_has_any); + } + __syncthreads(); + if (has_any == 0) { + return; + } + } + + for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) { + tile_A A[ntA][warp_size / tile_A::J]; +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { +#pragma unroll + for (int i = 0; i < tile_A::I; ++i) { + tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col]; + } +#pragma unroll + for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { + load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); + } + } + +#pragma unroll + for (int itB = 0; itB < ntB; ++itB) { + if constexpr (std::is_same_v) { +#pragma unroll + for (int j0 = 0; j0 < tile_B::I; ++j0) { + const int j = j0 + itB*tile_B::I; + + if constexpr (!has_ids) { + tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? 
y[j*stride_col_y + col] : 0.0f; + } else { + float val = 0.0f; + if (j < cols_per_block) { + const int slot = slot_map[j]; + if (slot >= 0) { + val = y[slot*stride_channel_y + j*stride_col_y + col]; + } + } + tile_xy[j0*tile_k_padded + threadIdx.x] = val; + } + } + } else if constexpr (std::is_same_v || std::is_same_v) { +#pragma unroll + for (int j0 = 0; j0 < tile_B::I; ++j0) { + const int j = j0 + itB*tile_B::I; + + if constexpr (!has_ids) { + const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); + tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + } else { + float2 tmp = make_float2(0.0f, 0.0f); + if (j < cols_per_block) { + const int slot = slot_map[j]; + if (slot >= 0) { + const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y); + tmp = y2_slot[j*stride_col_y + col]; + } + } + tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + } + } + } else { + static_assert(std::is_same_v, "unsupported type"); + } +#pragma unroll + for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { + tile_B B; + load_ldmatrix(B, tile_xy + k0, tile_k_padded); +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { + mma(C[itA][itB], A[itA][k0/tile_B::J], B); + } + } + } + } + + float * buf_iw = (float *) compute_base; + constexpr int kiw = nwarps*rows_per_block + 4; + + if (nwarps > 1) { + __syncthreads(); + } +#pragma unroll + for (int itB = 0; itB < ntB; ++itB) { +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l); + const int j = itB*tile_C::J + tile_C::get_j(l); + buf_iw[j*kiw + i] = C[itA][itB].x[l]; + } + } + } + + if (nwarps > 1) { + __syncthreads(); + } + +#pragma unroll + for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j0 + nwarps > cols_per_block && j >= cols_per_block) { + return; + } + + float sum = 0.0f; + static_assert(rows_per_block == warp_size, "need loop/check"); +#pragma unroll + for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) { + const int i = i0 + threadIdx.x; + + sum += buf_iw[j*kiw + i]; + } + + if constexpr (!has_ids) { + dst[j*stride_col_dst + row0 + threadIdx.x] = sum; + } else { + const int slot = (j < cols_per_block) ? 
slot_map[j] : -1; + if (slot >= 0) { + dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum; + } + } + } +#else + GGML_UNUSED_VARS(x, y, ids, dst, + ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + NO_DEVICE_CODE; +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +} + +template +static inline void mul_mat_f_switch_ids( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols_x, const int64_t nchannels_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int64_t stride_row_id, + const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, + const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) { + if (ids) { + mul_mat_f<<>> + (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + } else { + mul_mat_f<<>> + (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + } +} + +template +void mul_mat_f_cuda( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int64_t stride_row_id, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + typedef tile<16, 8, T> tile_A; + typedef tile< 8, 8, T> tile_B; + + GGML_ASSERT(ncols_x % 2 == 0); + GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(stride_col_y % 2 == 0); + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = nsamples_dst / nsamples_x; + + const int device = ggml_cuda_get_device(); + const int warp_size = ggml_cuda_info().devices[device].warp_size; + + int64_t nwarps_best = 1; + int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2); + int64_t max_block_size = 256; + for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) { + const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2); + if (niter < niter_best) { + niter_best = niter; + nwarps_best = nwarps; + } + } + + constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; + const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4; + const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4; + const int 
nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); + const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0; + const int nbytes_shared_total = nbytes_shared + nbytes_slotmap; + const int64_t grid_y = ids ? nchannels_x : nchannels_dst; // per expert when ids present + + const dim3 block_nums(nrows_x/rows_per_block, grid_y, nsamples_dst); + const dim3 block_dims(warp_size, nwarps_best, 1); + + switch (nwarps_best) { + case 1: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 2: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 3: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 4: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 5: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 6: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 7: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 8: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + default: { + GGML_ABORT("fatal error"); + } break; + } + + GGML_UNUSED_VARS(nchannels_y); +} + +template +static void mul_mat_f_switch_cols_per_block( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t 
ncols_x, const int64_t nrows_x, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int stride_row_id, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + switch (ncols_dst) { + case 1: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 2: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 3: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 4: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 5: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 6: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 7: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 8: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 9: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, 
stride_sample_y, stride_sample_dst, stream); + } break; + case 10: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 11: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 12: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 13: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 14: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 15: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 16: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + default: { + GGML_ABORT("fatal error"); + } break; + } +} + +#define DECL_MMF_CASE_HELPER(T, ncols_dst) \ + template void mul_mat_f_cuda( \ + const T * x, const float * y, const int32_t * ids, float * dst, \ + const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \ + const int64_t stride_col_id, const int64_t stride_row_id, \ + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \ + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\ + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, \ + cudaStream_t stream); + +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#define DECL_MMF_CASE_EXTERN(ncols_dst) \ + extern DECL_MMF_CASE_HELPER(float, ncols_dst) \ + extern DECL_MMF_CASE_HELPER(half2, ncols_dst) \ + extern DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst) + +#define DECL_MMF_CASE(ncols_dst) \ + DECL_MMF_CASE_HELPER(float, ncols_dst) \ + DECL_MMF_CASE_HELPER(half2, 
ncols_dst) \ + DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst) + +DECL_MMF_CASE_EXTERN(1); +DECL_MMF_CASE_EXTERN(2); +DECL_MMF_CASE_EXTERN(3); +DECL_MMF_CASE_EXTERN(4); +DECL_MMF_CASE_EXTERN(5); +DECL_MMF_CASE_EXTERN(6); +DECL_MMF_CASE_EXTERN(7); +DECL_MMF_CASE_EXTERN(8); +DECL_MMF_CASE_EXTERN(9); +DECL_MMF_CASE_EXTERN(10); +DECL_MMF_CASE_EXTERN(11); +DECL_MMF_CASE_EXTERN(12); +DECL_MMF_CASE_EXTERN(13); +DECL_MMF_CASE_EXTERN(14); +DECL_MMF_CASE_EXTERN(15); +DECL_MMF_CASE_EXTERN(16); +#else +#define DECL_MMF_CASE(ncols_dst) +#endif diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index 3428113dc8..da2d7b7c3b 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -24,7 +24,7 @@ TYPES_MMQ = [ "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K", "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S", - "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS" + "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4" ] SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. @@ -34,6 +34,13 @@ SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do DECL_MMQ_CASE({type}); """ +SOURCE_MMF = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE({type}); +""" + def get_short_name(long_quant_name): return long_quant_name.replace("GGML_TYPE_", "").lower() @@ -76,3 +83,7 @@ for ncols in [8, 16, 32, 64]: for type in TYPES_MMQ: with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f: f.write(SOURCE_MMQ.format(type=type)) + +for type in range(1, 17): + with open(f"mmf-instance-ncols_{type}.cu", "w") as f: + f.write(SOURCE_MMF.format(type=type)) diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu new file mode 100644 index 0000000000..f594d5d51d --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(1); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu new file mode 100644 index 0000000000..9cc6772542 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(10); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu new file mode 100644 index 0000000000..317f487d7a --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmf.cuh" + +DECL_MMF_CASE(11); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu new file mode 100644 index 0000000000..dc0033227c --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(12); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu new file mode 100644 index 0000000000..0782101753 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(13); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu new file mode 100644 index 0000000000..a23ad6ae26 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(14); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu new file mode 100644 index 0000000000..0fe3f7821e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(15); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu new file mode 100644 index 0000000000..544086375e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(16); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu new file mode 100644 index 0000000000..3b901797cf --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(2); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu new file mode 100644 index 0000000000..56e940bba0 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(3); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu new file mode 100644 index 0000000000..a7665d49d0 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmf.cuh" + +DECL_MMF_CASE(4); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu new file mode 100644 index 0000000000..3a1dff2587 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(5); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu new file mode 100644 index 0000000000..400fb7c663 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(6); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu new file mode 100644 index 0000000000..954a1c7e03 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(7); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu new file mode 100644 index 0000000000..f1bd09c945 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(8); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu new file mode 100644 index 0000000000..1255ac2af6 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmf.cuh" + +DECL_MMF_CASE(9); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index eeb6c9d4b7..07b96dbdd5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,6 +48,11 @@ static struct ggml_backend_metal_device_context { int mtl_device_ref_count; id mtl_library; + // a single global queue shared by all Metal backends + // technically not needed for devices with unified memory, but enables discrete GPUs support + // ref: https://github.com/ggml-org/llama.cpp/pull/15906 + id mtl_queue; + NSLock * mtl_lock; bool has_simdgroup_reduction; @@ -56,6 +61,7 @@ static struct ggml_backend_metal_device_context { bool has_bfloat; bool use_bfloat; bool use_fusion; + bool use_shared_buffers; int debug_fusion; @@ -69,6 +75,7 @@ static struct ggml_backend_metal_device_context { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_queue =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, @@ -76,6 +83,7 @@ static struct ggml_backend_metal_device_context { /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, /*.use_fusion =*/ true, + /*.use_shared_buffers =*/ true, /*.debug_fusion =*/ 0, /*.fuse_cnt =*/ { 0 }, /*.max_size =*/ 0, @@ -94,6 +102,11 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + if (ctx->mtl_queue == nil) { + GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); + } + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -118,6 +131,12 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->debug_fusion = val ? 
atoi(val) : 0; } + ctx->use_shared_buffers = ctx->mtl_device.hasUnifiedMemory; + + if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) { + ctx->use_shared_buffers = false; + } + memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt)); ctx->max_size = ctx->mtl_device.maxBufferLength; @@ -161,6 +180,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -467,8 +491,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, - GGML_METAL_KERNEL_TYPE_SET_I32, - GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -773,7 +795,7 @@ struct ggml_metal_command_buffer { struct ggml_backend_metal_context { id device; - id queue; + id queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND] dispatch_queue_t d_queue; @@ -803,6 +825,12 @@ struct ggml_backend_metal_context { // n_cb command buffers + 1 used by the main thread struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + // extra command buffers for things like getting, setting and copying tensors + NSMutableArray * cmd_bufs_ext; + + // the last command buffer queued into the Metal queue with operations relevant to the current Metal backend + id cmd_buf_last; + // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; @@ -999,7 +1027,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + + // TODO: question - would it be better to have one queue for the backend and one queue for the device? + // the graph encoders and async ops would use the backend queue while the sync ops would use the device queue? + //ctx->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND] + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1058,6 +1090,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false"); GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: use fusion = %s\n", __func__, ctx_dev->use_fusion ? "true" : "false"); + GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, ctx_dev->use_shared_buffers ? "true" : "false"); GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? 
"true" : "false"); ctx->capture_next_compute = false; @@ -1073,6 +1107,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->cmd_bufs[i].mem_pool->device = device; } + ctx->cmd_bufs_ext = [[NSMutableArray alloc] init]; + + ctx->cmd_buf_last = nil; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); @@ -1390,8 +1428,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -1663,14 +1699,19 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; + //[ctx->queue release]; // [TAG_QUEUE_PER_BACKEND] for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - // ctx->cmd_bufs[i].obj is auto released + if (ctx->cmd_bufs[i].obj) { + [ctx->cmd_bufs[i].obj release]; + } ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext removeAllObjects]; + [ctx->cmd_bufs_ext release]; + dispatch_release(ctx->d_queue); free(ctx); @@ -1688,14 +1729,21 @@ struct ggml_backend_metal_buffer { struct ggml_backend_metal_buffer_context { void * all_data; size_t all_size; - bool owned; + + // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host + bool is_shared; // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; // optional MTLResidencySet + // note: cannot use explicity "id" here because it is not available on certain OSes id rset; + + // pointers to global device objects + id device; + id queue; }; // rset init @@ -1761,7 +1809,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { +static id ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) { //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -1984,16 +2032,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } - case GGML_OP_SET: - { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_I32: - return true; - default: - return false; - }; - } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -5569,68 +5607,6 @@ static int ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) 
threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; } break; - case GGML_OP_SET: - { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // src0 and dst as viewed during set - const size_t dst_nb0 = ggml_element_size(src0); - - const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; - const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; - const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; - const size_t offset = ((int32_t *) dst->op_params)[3]; - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst)); - } - - const int im0 = (ne10 == 0 ? 0 : ne10-1); - const int im1 = (ne11 == 0 ? 0 : ne11-1); - const int im2 = (ne12 == 0 ? 0 : ne12-1); - const int im3 = (ne13 == 0 ? 0 : ne13-1); - - GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - GGML_ASSERT(nb10 == sizeof(float)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; - case GGML_TYPE_I32: - GGML_ASSERT(nb10 == sizeof(int32_t)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - ggml_metal_kargs_set args = { - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.nb1 =*/ dst_nb1, - /*.nb2 =*/ dst_nb2, - /*.nb3 =*/ dst_nb3, - /*.offs =*/ offset, - /*.inplace =*/ inplace, - }; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); - - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); @@ -5759,6 +5735,12 @@ static enum ggml_status ggml_metal_graph_compute( if (should_capture) { ctx->capture_next_compute = false; + // make sure all previous computations have finished before starting the capture + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + if (!ctx->capture_started) { // create capture scope ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx_dev->mtl_device]; @@ -5781,78 +5763,103 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed + // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009 + //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; + ctx->encode_async(n_cb); } - // prepare the rest of the command buffers asynchronously + // remember the command buffer for the next iteration + ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj; + + // prepare the rest of 
the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[cb_idx].obj) { + [ctx->cmd_bufs[cb_idx].obj release]; + } ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + + // update the pointer to the last queued command buffer + // this is needed to implement synchronize() + ctx->cmd_buf_last = cmd_buf; } } dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - // wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) - { - id cmd_buf = ctx->cmd_bufs[n_cb].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } - - return GGML_STATUS_FAILED; - } - } - - for (int i = 0; i < n_cb; ++i) { - id cmd_buf = ctx->cmd_bufs[i].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } - - return GGML_STATUS_FAILED; - } - - id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); - if (!next_buffer) { - continue; - } - - const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; - } - - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; - } - - [next_buffer commit]; - } + // for debugging: block until graph is computed + //[ctx->cmd_buf_last waitUntilCompleted]; + // enter here only when capturing in order to wait for all computation to finish + // otherwise, we leave the graph to compute asynchronously if (!should_capture && ctx->capture_started) { + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + } + + for (int i = 0; i < n_cb; ++i) { + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + + id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); + if (!next_buffer) { + continue; + } + + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; + } + [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; } @@ -5862,10 +5869,12 @@ static enum ggml_status ggml_metal_graph_compute( } //////////////////////////////////////////////////////////////////////////////// - // backend interface +//////////////////////////////////////////////////////////////////////////////// -static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +// shared buffer + +static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -5874,7 +5883,9 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) ggml_backend_metal_buffer_rset_free(ctx); - if (ctx->owned) { + GGML_ASSERT(ctx->is_shared); + + { #if TARGET_OS_OSX vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); #else @@ -5885,66 +5896,254 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_metal_buffer_shared_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + GGML_ASSERT(ctx->is_shared); + + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_metal_buffer_shared_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct 
ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_metal_buffer_shared_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return false; +} + +static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + memset(ctx->all_data, value, ctx->all_size); } -static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { - /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, - /* .get_base = */ ggml_backend_metal_buffer_get_base, +static struct ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_shared_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_shared_get_base, /* .init_tensor = */ NULL, - /* .memset_tensor = */ ggml_backend_metal_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor, - /* .clear = */ ggml_backend_metal_buffer_clear, + /* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor, + /* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor, + /* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor, + /* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor, + /* .clear = */ ggml_backend_metal_buffer_shared_clear, /* .reset = */ NULL, }; -// default buffer type +// private buffer -static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "Metal"; +static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - GGML_UNUSED(buft); + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } + + ggml_backend_metal_buffer_rset_free(ctx); + + GGML_ASSERT(!ctx->is_shared); + + free(ctx); } +static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; +} + +static void ggml_backend_metal_buffer_private_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // dst + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder fillBuffer:buf_dst + 
range:NSMakeRange(buf_dst_offset, size) + value:value]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static void ggml_backend_metal_buffer_private_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // src + void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data + id buf_src = [ctx->device newBufferWithBytesNoCopy:data_ptr + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + // dst + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete + // this is an alternative to waitUntilCompleted, which should be faster, but doesn't seem to make much difference + dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0); + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + } + + [cmd_buf addCompletedHandler:^(id cb) { + // TODO: can check for errors here + GGML_UNUSED(cb); + + dispatch_semaphore_signal(completion_semaphore); + }]; + + [cmd_buf commit]; + + dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER); + //[cmd_buf waitUntilCompleted]; + } +} + +static void ggml_backend_metal_buffer_private_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // src + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + buf_src_offset += offset; + + // dst + id buf_dst = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return false; +} + +static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder fillBuffer:ctx->buffers[0].metal + range:NSMakeRange(0, ctx->buffers[0].size) + value:value]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static struct 
ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_private_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor, + /* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor, + /* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor, + /* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor, + /* .clear = */ ggml_backend_metal_buffer_private_clear, + /* .reset = */ NULL, +}; + +// +// buffer types +// + static void ggml_backend_metal_log_allocated_size(id device, size_t size_aligned) { #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -5970,7 +6169,8 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s GGML_UNUSED(size_aligned); } -static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +// common method for allocating shared or private Metal buffers +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) { struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -5986,22 +6186,40 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; - ctx->all_data = ggml_metal_host_malloc(size_aligned); + // allocate shared buffer if the device supports it and it is required by the buffer type + if (ctx_dev->use_shared_buffers && shared) { + ctx->all_data = ggml_metal_host_malloc(size_aligned); + ctx->is_shared = true; + } else { + // dummy, non-NULL value - we'll populate this after creating the Metal buffer below + ctx->all_data = (void *) 0x000000400ULL; + ctx->is_shared = false; + } ctx->all_size = size_aligned; - ctx->owned = true; + + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; ctx->buffers[0].size = size; ctx->buffers[0].metal = nil; if (size_aligned > 0) { - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + if (ctx_dev->use_shared_buffers) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } else { + ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + + ctx->all_data = (void *) (ctx->buffers[0].metal.gpuAddress); + } } + + ctx->buffers[0].data = ctx->all_data; } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { @@ -6018,36 +6236,50 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba //ggml_backend_metal_log_allocated_size(device, size_aligned); - return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); + struct ggml_backend_buffer_i buf_i = ctx->is_shared ? 
ggml_backend_metal_buffer_shared_i : ggml_backend_metal_buffer_private_i; + + return ggml_backend_buffer_init(buft, buf_i, ctx, size); } -static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +// default (shared) buffer type + +static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true); +} + +static size_t ggml_backend_metal_buffer_type_shared_get_alignment(ggml_backend_buffer_type_t buft) { return 32; GGML_UNUSED(buft); } -static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_buffer_type_t buft) { const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; return max_size; } -static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; +static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_type_t buft) { + return false; GGML_UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, + /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, @@ -6056,116 +6288,101 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } -static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { - return "Metal_Mapped"; +// default (private) buffer type + +static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) { + return "Metal_Private"; GGML_UNUSED(buft); } -static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { - static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, false); +} + +static size_t ggml_backend_metal_buffer_type_private_get_alignment(ggml_backend_buffer_type_t buft) { + return 32; + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_buffer_type_t buft) { + const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; + + 
return max_size; +} + +static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, + /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, }; - return &ggml_backend_buffer_from_ptr_type_metal; + return &ggml_backend_buffer_type_metal; } -// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); +// mapped buffer type - ctx->all_data = data; - ctx->all_size = size; - ctx->owned = false; - ctx->n_buffers = 0; +static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) { + return "Metal_Mapped"; - const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_UNUSED(buft); +} - // page-align the data ptr - { - const uintptr_t offs = (uintptr_t) data % size_page; - data = (void *) ((char *) data - offs); - size += offs; - } +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // for mapped buffers, prefer shared memory + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true); +} - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } +static size_t ggml_backend_metal_buffer_type_mapped_get_alignment(ggml_backend_buffer_type_t buft) { + return 32; - struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; + GGML_UNUSED(buft); +} - GGML_ASSERT(ctx_dev->mtl_device != nil); +static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_buffer_type_t buft) { + const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; - id device = ctx_dev->mtl_device; + return max_size; +} - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = nil; +static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_type_t buft) { + return false; - if (size_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + GGML_UNUSED(buft); +} - if (ctx->buffers[ctx->n_buffers].metal == nil) { - 
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; - } - } +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) { + // note: not obvious, but this buffer type still needs to implement .alloc_buffer: + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 + static struct ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, + }, + /* .device = */ &g_ggml_backend_metal_device, + /* .context = */ NULL, + }; - ggml_backend_metal_log_allocated_size(device, size_aligned); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = device.maxBufferLength - size_ovlp; - const size_t size_view = device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_step_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_step_aligned); - - if (i + size_step < size) { - GGML_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - - if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { - GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); - free(ctx); - return NULL; - } - - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); + return &ggml_backend_buffer_type_mapped_metal; } // backend @@ -6184,6 +6401,137 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + struct ggml_backend_metal_context * ctx = backend->context; + + // wait for any backend operations to finish + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + // release any completed command buffers + if (ctx->cmd_bufs_ext.count > 0) { + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: 
%s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); + } + + [cmd_buf release]; + } + + [ctx->cmd_bufs_ext removeAllObjects]; + } +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + // wrap the source data into a Metal buffer + id buf_src = [device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + if (buf_dst == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } + + buf_dst_offset += offset; + + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_dst = [device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + if (buf_src == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } + + buf_src_offset += offset; + + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + return false; + + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); +} + static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return ggml_metal_graph_compute(backend, cgraph); } @@ -6214,7 +6562,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t 
backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + + ggml_metal_mem_pool_reset(mem_pool); id encoder = [cmd_buf computeCommandEncoder]; @@ -6228,9 +6579,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; - ggml_metal_mem_pool_reset(mem_pool); - for (int idx = node_start; idx < node_end;) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; @@ -6264,17 +6612,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups + /* .synchronize = */ ggml_backend_metal_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, + + // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal + // in any case, these docs seem relevant if we ever decide to implement it: + // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_metal_guid(void) { @@ -6375,7 +6728,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { - /* .async = */ false, + /* .async = */ true, /* .host_buffer = */ false, /* .buffer_from_host_ptr = */ true, /* .events = */ false, @@ -6406,17 +6759,19 @@ static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, con } static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_metal_buffer_type(); + struct ggml_backend_metal_device_context * ctx_dev = dev->context; - GGML_UNUSED(dev); + return ctx_dev->use_shared_buffers ? 
ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private(); } -static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = ptr; ctx->all_size = size; - ctx->owned = false; + + ctx->is_shared = true; + ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); @@ -6439,6 +6794,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { ctx->buffers[ctx->n_buffers].data = ptr; @@ -6496,7 +6854,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back return NULL; } - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, ctx, size); } static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { @@ -6507,14 +6865,30 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return - buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || - buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; + buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name; GGML_UNUSED(dev); } +static int64_t get_op_batch_size(const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - return false; + const int min_batch_size = 32; + + return (op->op == GGML_OP_MUL_MAT || + op->op == GGML_OP_MUL_MAT_ID) && + get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); GGML_UNUSED(op); @@ -6529,7 +6903,7 @@ static struct ggml_backend_device_i ggml_backend_metal_device_i = { /* .init_backend = */ ggml_backend_metal_device_init, /* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped, /* .supports_op = */ ggml_backend_metal_device_supports_op, /* .supports_buft = */ ggml_backend_metal_device_supports_buft, /* .offload_op = */ ggml_backend_metal_device_offload_op, diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 77be3c5c9d..157d0cc6d0 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5571,38 +5571,6 @@ kernel void kernel_flash_attn_ext_vec_reduce( #undef DV } -template -kernel void kernel_set( - constant ggml_metal_kargs_set & 
args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i13 = tgpig[2]; - const int i12 = tgpig[1]; - const int i11 = tgpig[0]; - - const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; - - const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); - const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); - const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; - - device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); - - for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { - device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); - dst_data[i10] = (T) src[0]; - } -} - -typedef decltype(kernel_set) kernel_set_t; - -template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; -template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; - template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 727163b7fd..b188c5af34 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2838,6 +2838,7 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .graph_compute = */ ggml_backend_opencl_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; ggml_backend_t ggml_backend_opencl_init(void) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index e84ff93efc..d4833068d0 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -795,6 +795,7 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 877fbf7e86..619ccaefc0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4063,6 +4063,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .event_record = */ ggml_backend_sycl_event_record, /* .event_wait = */ ggml_backend_sycl_event_wait, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_sycl_guid() { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e6245d0cfc..cb379fe94e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -583,6 +583,7 @@ struct vk_device_struct { bool disable_fusion; bool disable_host_visible_vidmem; bool allow_sysmem_fallback; + bool disable_optimize_graph; #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; @@ -1936,7 +1937,9 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); - for (auto &req_flags : req_flags_list) { + for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) { + const auto & req_flags = *it; + uint32_t memory_type_index = find_properties(&mem_props, 
&mem_req, req_flags); if (memory_type_index == UINT32_MAX) { @@ -1949,6 +1952,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std break; } catch (const vk::SystemError& e) { // loop and retry + // during last attempt throw the exception + if (it + 1 == req_flags_list.end()) { + device->device.destroyBuffer(buf->buffer); + throw e; + } } } @@ -3386,7 +3394,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -3592,6 +3600,9 @@ static vk_device ggml_vk_get_device(size_t idx) { const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK"); device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr; + const char* GGML_VK_DISABLE_OPTIMIZE_GRAPH = getenv("GGML_VK_DISABLE_OPTIMIZE_GRAPH"); + device->disable_optimize_graph = GGML_VK_DISABLE_OPTIMIZE_GRAPH != nullptr; + bool fp16_storage = false; bool fp16_compute = false; bool maintenance4_support = false; @@ -3732,6 +3743,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic); +#ifdef __APPLE__ + // Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_arithmetic = false; + } +#endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && @@ -9135,7 +9152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, 
GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], op_params[1] }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun); } static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool backprop, bool dryrun = false) { @@ -11853,6 +11870,131 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg UNUSED(backend); } +// Sort the graph for improved parallelism. +static void ggml_vk_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * graph) +{ + VK_LOG_DEBUG("ggml_vk_optimize_graph(" << graph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + if (ctx->device->disable_optimize_graph) { + return; + } + + auto const &is_empty = [](ggml_tensor * node) -> bool { + return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; + }; + + auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool { + for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { + if (dst->src[s] == src) { + return true; + } + } + // implicit dependency if they view the same tensor + const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst; + const ggml_tensor *src2 = src->view_src ? src->view_src : src; + if (dst2 == src2) { + return true; + } + return false; + }; + + // This function tries to reorder the graph to allow nodes to run in parallel. + // This helps with small batches, but for large batches its a slowdown, probably + // due to cache contention. So only reorder if the majority of nodes have few rows. + int num_small_nodes = 0; + int num_counted_nodes = 0; + for (int i = 0; i < graph->n_nodes; ++i) { + if (!is_empty(graph->nodes[i]) && + graph->nodes[i]->op != GGML_OP_SET_ROWS) { + if (ggml_nrows(graph->nodes[i]) <= 8) { + num_small_nodes++; + } + num_counted_nodes++; + } + } + if (num_small_nodes < num_counted_nodes / 2) { + return; + } + + std::vector new_order; + std::vector used(graph->n_nodes, false); + int first_unused = 0; + while (first_unused < graph->n_nodes) { + std::vector current_set; + + // First, grab the next unused node. + current_set.push_back(first_unused); + + // Loop through the next N nodes. Grab any that don't depend on other nodes that + // haven't already been run. Nodes that have already been run have used[i] set + // to true. Allow nodes that depend on the previous node if it's a fusion pattern + // that we support (e.g. RMS_NORM + MUL). + // This first pass only grabs "real" (non-view nodes). Second pass grabs view nodes. + // The goal is to not interleave real and view nodes in a way that breaks fusion. + const int NUM_TO_CHECK = 20; + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (is_empty(graph->nodes[j])) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL)) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + } + } + // Second pass grabs view nodes. 
+ // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add). + if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) { + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (!is_empty(graph->nodes[j])) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end(); + // skip views whose srcs haven't been processed. + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !c_in_current_set) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + } + } + } + + // Push the current set into new_order + for (auto c : current_set) { + new_order.push_back(graph->nodes[c]); + used[c] = true; + } + while (first_unused < graph->n_nodes && used[first_unused]) { + first_unused++; + } + } + // Replace the graph with the new order. + for (int i = 0; i < graph->n_nodes; ++i) { + graph->nodes[i] = new_order[i]; + } +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -11868,6 +12010,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ ggml_vk_optimize_graph, }; static ggml_guid_t ggml_backend_vk_guid() { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp index 29bd77d7e1..144ea58e6f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp @@ -20,6 +20,10 @@ void main() { const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + if (row >= p.KY) { + return; + } + FLOAT_TYPE scale = p.param1; // partial sums for thread in warp diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index aad27bf605..df6a3ed95a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -665,6 +665,7 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* .graph_compute = */ ggml_backend_webgpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; /* End GGML Backend Interface */ diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 7507a52aea..a4c51ab4e7 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -586,6 +586,7 @@ static ggml_backend_i ggml_backend_zdnn_i = { /* .graph_compute = */ ggml_backend_zdnn_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_zdnn_guid(void) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b8ac394580..1e88b6505b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -109,6 +109,7 @@ class Keys: POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + DECODER_BLOCK_COUNT = "{arch}.decoder_block_count" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" SWIN_NORM = "{arch}.swin_norm" diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index a6cc8a931e..7ff12f7f57 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -676,6 +676,9 @@ class GGUFWriter: def add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) + def add_decoder_block_count(self, value: int) -> None: + self.add_uint32(Keys.LLM.DECODER_BLOCK_COUNT.format(arch=self.arch), value) + def add_embedding_length_per_layer_input(self, value: int) -> None: self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value) diff --git a/media/llama1-icon-transparent.png b/media/llama1-icon-transparent.png new file mode 100644 index 0000000000..432d6c2223 Binary files /dev/null and b/media/llama1-icon-transparent.png differ diff --git a/media/llama1-icon-transparent.svg b/media/llama1-icon-transparent.svg new file mode 100644 index 0000000000..e28203f4e8 --- /dev/null +++ b/media/llama1-icon-transparent.svg @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + diff --git a/media/llama1-icon.png b/media/llama1-icon.png new file mode 100644 index 0000000000..0e44672e54 Binary files /dev/null and b/media/llama1-icon.png differ diff --git a/media/llama1-icon.svg b/media/llama1-icon.svg new file mode 100644 index 0000000000..dcbe9cce9b --- /dev/null +++ b/media/llama1-icon.svg @@ -0,0 +1,87 @@ + + + + + + + + + + + + + + + + + + diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 766745f42f..90c98c3ffe 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -2,7 +2,9 @@ mistral-common>=1.8.3 -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.4.0; platform_machine != "s390x" + +## Embedding Gemma requires PyTorch 2.6.0 or later +torch~=2.6.0; platform_machine != "s390x" # torch s390x packages can only be found from nightly builds --extra-index-url https://download.pytorch.org/whl/nightly diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt index 859204b27e..f6076142ce 100644 --- a/requirements/requirements-convert_legacy_llama.txt +++ b/requirements/requirements-convert_legacy_llama.txt @@ -1,5 +1,14 @@ numpy~=1.26.4 sentencepiece~=0.2.0 -transformers>=4.45.1,<5.0.0 + +# Embedding Gemma is currently a preview release: +# https://github.com/huggingface/transformers/releases/tag/v4.56.0-Embedding-Gemma-preview + +# The version is needed to be able to convert Embedding Gemma models to GGUF format: +git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview + +# Once Embedding Gemma is officially released, we can switch to: +#transformers>=4.57.1,<5.0.0 + gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt index b94521fc7f..f7912aff72 100644 --- a/requirements/requirements-tool_bench.txt +++ b/requirements/requirements-tool_bench.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 pytest~=8.3.3 -huggingface_hub~=0.23.2 +huggingface_hub>=0.34.0,<1.0 matplotlib~=3.10.0 numpy~=1.26.4 openai~=1.55.3 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 77b2fecf18..81f9746818 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -137,6 +137,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, 
"%s.decoder_start_token_id" }, + { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_SWIN_NORM, "%s.swin_norm" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 21ab47bd7a..6ee3707dcf 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -141,6 +141,7 @@ enum llm_kv { LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, + LLM_KV_DECODER_BLOCK_COUNT, LLM_KV_ATTN_LOGIT_SOFTCAPPING, LLM_KV_FINAL_LOGIT_SOFTCAPPING, LLM_KV_SWIN_NORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 874c6f82cb..3e163001c1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute( if (backend_cpu != nullptr) { auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); + if (set_threadpool_fn) { + set_threadpool_fn(backend_cpu, tp); + } } // set the number of threads for all the backends diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7f254b25cd..ddc772b179 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( // split the batch into streams if needed const auto n_stream = k->ne[3]; - q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream); + q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0); q = ggml_permute(ctx0, q, 0, 2, 1, 3); k = ggml_permute(ctx0, k, 0, 2, 1, 3); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 89f5c7ab65..4dca2ca41d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -159,6 +159,7 @@ struct llama_hparams { // needed by encoder-decoder models (e.g. 
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 89f5c7ab65..4dca2ca41d 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -159,6 +159,7 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b9e4634a70..818b209641 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1542,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.dec_start_token_id = dec_start_token_id;
                 }

+                hparams.dec_n_layer = hparams.n_layer;
+                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                 switch (hparams.n_layer) {
                     case 6: type = LLM_TYPE_60M; break; // t5-small
                     case 8: type = LLM_TYPE_80M; break; // flan-t5-small
@@ -4414,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                 }

+                // n_layer:     number of encoder_layers
+                // dec_n_layer: number of decoder_layers
+                const int dec_n_layer = hparams.dec_n_layer;
+                if (dec_n_layer > n_layer) {
+                    layers.resize(dec_n_layer);
+                }
+
+                // load encoder layers
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];

@@ -4429,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                     layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                }
+
+                // load decoder layers
+                for (int i = 0; i < dec_n_layer; ++i) {
+                    auto & layer = layers[i];

                     layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                     layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@@ -13509,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {

         ggml_tensor * inp_out_ids = build_inp_out_ids();

-        for (int il = 0; il < n_layer; ++il) {
+        const int64_t dec_n_layer = hparams.dec_n_layer;
+
+        for (int il = 0; il < dec_n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
@@ -13600,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }

-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == dec_n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
@@ -13621,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
                         NULL,
-                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR  : LLM_FFN_SEQ,
+                        model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ,
                         il);
                 cb(cur, "ffn_out", il);
             }
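As a worked example of what these changes enable, a hypothetical asymmetric T5-style model with 12 encoder blocks and 2 decoder blocks would carry metadata roughly like the following (key names come from the constants added above; the arch string and counts are illustrative, not taken from this PR):

```python
# Illustrative GGUF metadata for a hypothetical asymmetric encoder-decoder model (values made up).
metadata = {
    "t5.block_count":          12,  # encoder depth -> hparams.n_layer
    "t5.decoder_block_count":   2,  # decoder depth -> hparams.dec_n_layer (defaults to block_count)
}
# load_tensors() then creates encoder tensors for blocks 0..11 and decoder tensors for
# blocks 0..1, growing the shared `layers` vector only when the decoder is the deeper stack.
```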
diff --git a/src/llama.cpp b/src/llama.cpp
index f0d4f5f891..92cddccc99 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -83,7 +83,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }

diff --git a/tests/.gitignore b/tests/.gitignore
index 620a48ee44..cbc381606c 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -2,3 +2,4 @@
 !*.*
 *.o
 ggml-common.h
+**/*.swp
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6055b9d864..1b9e8a2464 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6162,6 +6162,9 @@ static std::vector> make_test_cases_eval() {
         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2});
         add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2});

+        // test case for k_bin_bcast_unravel in CUDA backend
+        add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});
+
         // stable diffusion
         add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1});
         add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1});
@@ -6373,7 +6376,7 @@ static std::vector> make_test_cases_eval() {
     for (int n_mats : {4, 8}) {
         for (int n_used : {1, 2, 4}) {
             for (bool b : {false, true}) {
-                for (int n : {1, 32, 129}) {
+                for (int n : {1, 4, 5, 32, 129}) {
                     int m = 512;
                     int k = 256;
                     test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
@@ -6503,6 +6506,7 @@ static std::vector> make_test_cases_eval() {
         for (int64_t ne1 : {16, 1024}) {
             test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, max_bias));
             test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias));
+            test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 2, 3}, scale, max_bias));
         }
     }
 }
@@ -6929,7 +6933,17 @@ static void list_all_ops() {
 static void show_test_coverage() {
     std::set all_ops;
     for (int i = 1; i < GGML_OP_COUNT; i++) {
-        all_ops.insert(ggml_op_name((enum ggml_op)i));
+        auto op = (enum ggml_op)i;
+        if (op == GGML_OP_VIEW ||
+            op == GGML_OP_RESHAPE ||
+            op == GGML_OP_PERMUTE ||
+            op == GGML_OP_TRANSPOSE ||
+            op == GGML_OP_CONT ||
+            op == GGML_OP_GLU ||
+            op == GGML_OP_UNARY) {
+            continue;
+        }
+        all_ops.insert(ggml_op_name(op));
     }
     for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
         all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 78ee55e246..67df240c6f 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1209,6 +1209,51 @@ static void test_all(const std::string & lang, std::function
 {
       const ref = compSchema.$ref;
       if (ref !== undefined) {
@@ -648,6 +649,10 @@ export class SchemaConverter {
             }
           }
         }
+
+        if ('enum' in compSchema) {
+          enumSets.push(new Set(compSchema.enum || []));
+        }
       };

       for (const t of schema.allOf) {
@@ -660,6 +665,14 @@ export class SchemaConverter {
         }
       }

+      if (enumSets.length > 0) {
+        const enumIntersection = new Set([...enumSets[0]].filter(v => enumSets.every(s => s.has(v))));
+        if (enumIntersection.size > 0) {
+          const sortedEnums = [...enumIntersection].sort((a, b) => a.localeCompare(b));
+          const rule = '(' + sortedEnums.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
+          return this._addRule(ruleName, rule);
+        }
+      }
       return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
     } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
       const items = schema.items ?? schema.prefixItems;
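The `SchemaConverter` change above makes an `allOf` whose components are `enum`s collapse to the intersection of the allowed values when generating the grammar. A small self-contained Python sketch of that selection step (illustrative only, not the repo's own Python converter):

```python
# Collapse an allOf of enum components to the values that every enum component allows.
# `components` stands in for the already-resolved allOf members of a schema.
def all_of_enum_intersection(components: list[dict]) -> list | None:
    enum_sets = [set(c["enum"]) for c in components if "enum" in c]
    if not enum_sets:
        return None  # no enum components; fall back to the object rule
    common = set.intersection(*enum_sets)
    return sorted(common) if common else None


# Example: {"allOf": [{"enum": ["a", "b", "c"]}, {"enum": ["b", "c", "d"]}]} -> ["b", "c"]
print(all_of_enum_intersection([{"enum": ["a", "b", "c"]}, {"enum": ["b", "c", "d"]}]))
```

When the intersection is non-empty, the converter emits an alternation of the surviving constant rules followed by `space`, instead of falling back to the object rule.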
diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt
index 15d024914e..4ea7f19f77 100644
--- a/tools/server/tests/requirements.txt
+++ b/tools/server/tests/requirements.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub~=0.23.2
+huggingface_hub>=0.34.0,<1.0
 numpy~=1.26.4
 openai~=1.55.3
 prometheus-client~=0.20.0