Merge branch 'master' into HEAD

This commit is contained in:
Georgi Gerganov 2025-12-06 16:55:33 +02:00
commit fdac9686f7
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
54 changed files with 21534 additions and 1158 deletions

View File

@ -16,7 +16,7 @@ The project differentiates between 3 levels of contributors:
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- If your PR becomes stale, rebase it on top of the latest `master` to get the maintainers' attention
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.

View File

@ -1524,6 +1524,79 @@ class TextModel(ModelBase):
special_vocab._set_special_token("bos", 151643)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
local_template_file_path = self.dir_model / "chat_template.jinja"
if self.is_mistral_format and local_template_file_path.is_file():
# Ministral-3 and other new Mistral models come with chat templates.
# ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
logger.info("Using an existing Mistral local chat template.")
with open(local_template_file_path, "r", encoding="utf-8") as f:
template = f.read()
elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
template_dir = Path(__file__).parent / "models/templates/"
# For the Mistral format, log that official tokenization and detokenization should go through `mistral-common`.
if self.is_mistral_format:
logger.info(
"Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
"Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
)
template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
else:
logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
template = None
if template is not None:
self.gguf_writer.add_chat_template(template)
class MmprojModel(ModelBase):
model_type = ModelType.MMPROJ
@ -2294,79 +2367,6 @@ class LlamaModel(TextModel):
if self.hf_arch == "VLlama3ForCausalLM":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
local_template_file_path = self.dir_model / "chat_template.jinja"
if self.is_mistral_format and local_template_file_path.is_file():
# Ministral-3 and other new Mistral models come with chat templates.
# ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
logger.info("Using an existing Mistral local chat template.")
with open(local_template_file_path, "r", encoding="utf-8") as f:
template = f.read()
elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
template_dir = Path(__file__).parent / "models/templates/"
# Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
if self.is_mistral_format:
logger.info(
"Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
"Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
)
template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
else:
logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
template = None
if template is not None:
self.gguf_writer.add_chat_template(template)
def set_vocab(self):
if self.is_mistral_format:
return self._set_vocab_mistral()
@ -9924,17 +9924,109 @@ class MistralModel(LlamaModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if "yarn" in self.hparams:
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
if "llama_4_scaling" in self.hparams:
self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
@staticmethod
def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
if "yarn" in hparams:
yarn_params = hparams["yarn"]
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
if "llama_4_scaling" in hparams:
gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
class MistralMoeModel(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
model_name = "Mistral"
hf_arch = ""
is_mistral_format = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info("Using MistralMoeModel")
# remap hparams from Mistral MoE format to DeepseekV2 format
# we do it this way so that we can reuse the DeepseekV2Model set_gguf_parameters logic
# ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
config = self.hparams
# Mistral key -> HF key
config_mapping = {
"dim": "hidden_size",
"norm_eps": "rms_norm_eps",
"n_kv_heads": "num_key_value_heads",
"n_layers": "num_hidden_layers",
"n_heads": "num_attention_heads",
"hidden_dim": "intermediate_size",
}
# HF key -> (Mistral key, default value)
top_level_mapping_with_default = {
"model_type": ("model_type", "transformer"),
"hidden_act": ("activation", "silu"),
"tie_word_embeddings": ("tied_embeddings", False),
"max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
"max_position_embeddings": ("max_position_embeddings", 128_000),
}
# mapping top-level keys
for key, new_key in config_mapping.items():
if key in config:
config[new_key] = config[key]
for new_key, (key, default_value) in top_level_mapping_with_default.items():
config[new_key] = config.get(key, default_value)
# mapping MoE-specific keys
moe_config_map = {
"route_every_n": "moe_layer_freq",
"first_k_dense_replace": "first_k_dense_replace",
"num_experts_per_tok": "num_experts_per_tok",
"num_experts": "n_routed_experts",
"expert_hidden_dim": "moe_intermediate_size",
"routed_scale": "routed_scaling_factor",
"num_shared_experts": "n_shared_experts",
"num_expert_groups": "n_group",
"num_expert_groups_per_tok": "topk_group",
}
moe = config["moe"]
for key, new_key in moe_config_map.items():
if key in moe:
config[new_key] = moe[key]
# provide missing values
config["topk_method"] = None
config["norm_topk_prob"] = True
config["scoring_func"] = "softmax"
def set_vocab(self):
self._set_vocab_mistral()
def set_gguf_parameters(self):
super().set_gguf_parameters()
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
return []
# rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
if name.endswith(".qscale_act"):
name = name.replace(".qscale_act", ".input_scale")
if name.endswith(".qscale_weight"):
name = name.replace(".qscale_weight", ".weight_scale")
if ".wkv_b." in name:
name = name.replace(".wkv_b.", ".kv_b_proj.")
if ".experts." in name:
name = name.replace(".experts.", ".mlp.experts.")
name = name.replace(".w1.", ".gate_proj.")
name = name.replace(".w2.", ".down_proj.")
name = name.replace(".w3.", ".up_proj.")
name = "model." + name
return super().modify_tensors(data_torch, name, bid)
class PixtralModel(LlavaVisionModel):
@ -10490,6 +10582,8 @@ def main() -> None:
elif args.mmproj:
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
model_class = PixtralModel
elif "moe" in hparams:
model_class = MistralMoeModel
else:
model_class = MistralModel

View File

@ -12,111 +12,111 @@ Legend:
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |

docs/ops/WebGPU.csv (new file, 18741 lines)

File diff suppressed because it is too large

View File

@ -20,6 +20,7 @@ else()
add_subdirectory(gguf-hash)
add_subdirectory(gguf)
add_subdirectory(idle)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(parallel)

View File

@ -0,0 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/idle/README.md (new file, 3 lines)
View File

@ -0,0 +1,3 @@
# llama.cpp/example/idle
https://github.com/ggml-org/llama.cpp/pull/17766
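The tool measures how long a single `llama_decode` call takes after the GPU has been left idle for a growing amount of time (see `idle.cpp` below). A minimal invocation, mirroring the usage string the example prints (the model path is a placeholder):

llama-idle -m model.gguf [-ngl n_gpu_layers]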

examples/idle/idle.cpp (new file, 110 lines)
View File

@ -0,0 +1,110 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>
static void print_usage(int /*argc*/, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
printf("\n");
}
int main(int argc, char ** argv) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
common_init();
// init LLM
llama_backend_init();
llama_numa_init(params.numa);
// initialize the model
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
// we need just a dummy token to evaluate
std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 512;
ctx_params.n_batch = 512;
ctx_params.no_perf = false;
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
const int n_iters = 3;
// warm-up
llama_decode(ctx, batch);
llama_memory_clear(llama_get_memory(ctx), true);
llama_synchronize(ctx);
for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
double t_sum_us = 0.0;
double t_sum2_us = 0.0;
for (int i = 0; i < n_iters; i++) {
// this pause is important - it simulates "idle GPU"
std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
const int64_t t_start_us = llama_time_us();
// this should take constant time
llama_decode(ctx, batch);
llama_synchronize(ctx);
const int64_t t_end_us = llama_time_us();
const double t_cur_us = t_end_us - t_start_us;
#if 1
// print individual decode times
printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
#endif
t_sum_us += t_cur_us;
t_sum2_us += t_cur_us * t_cur_us;
llama_memory_clear(llama_get_memory(ctx), true);
llama_synchronize(ctx); // just in case
}
const double t_avg_us = t_sum_us / n_iters;
const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
fflush(stdout);
}
llama_free(ctx);
llama_model_free(model);
return 0;
}
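The per-pause statistics above rely on the sample-variance identity var = (sum(x^2) - n*avg^2) / (n - 1). A small self-contained check of that identity with three made-up decode times (not real measurements):

#include <cmath>
#include <cstdio>

int main() {
    const double x[3] = {10.0, 12.0, 14.0}; // hypothetical decode times in ms
    const int n = 3;
    double sum = 0.0, sum2 = 0.0;
    for (int i = 0; i < n; i++) { sum += x[i]; sum2 += x[i]*x[i]; }
    const double avg = sum / n;                                      // 12.0
    const double var = (sum2 / (n - 1)) - (avg * avg * n) / (n - 1); // (440 - 432) / 2 = 4.0
    printf("avg = %.1f ms, dev = %.1f ms\n", avg, sqrt(var));        // dev = 2.0 ms, matching the direct definition
    return 0;
}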

View File

@ -241,6 +241,12 @@ int main(int argc, char ** argv) {
llama_batch_free(batch);
// this one is managed by common_init_result
//llama_free(ctx);
llama_free(ctx2);
llama_free(ctx3);
if (result0 != result2) {
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
return 1;

View File

@ -1,6 +1,5 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
@ -8,7 +7,7 @@ extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 5
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

View File

@ -2196,6 +2196,15 @@ extern "C" {
int p2,
int p3);
// pad each dimension with values on the other side of the torus (looping around)
GGML_API struct ggml_tensor * ggml_pad_circular(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3);
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -2209,6 +2218,19 @@ extern "C" {
int rp3
);
// pad each dimension with values on the other side of the torus (looping around)
GGML_API struct ggml_tensor * ggml_pad_ext_circular(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
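A hedged usage sketch of the new extended circular variant, based on the wrap-around indexing implemented in the CPU and CUDA kernels later in this diff; the context setup, shapes and values are illustrative only:

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * out = ggml_pad_ext_circular(ctx, a, /*lp0=*/1, /*rp0=*/1, 0, 0, 0, 0, 0, 0);

    // out->ne[0] == 6; once the graph is computed, the element at i0 == 0 copies a's element at i0 == 3
    // and the element at i0 == 5 copies a's element at i0 == 0, since the source index is (i0 - lp0 + ne00) % ne00

    ggml_free(ctx);
    return 0;
}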

View File

@ -534,8 +534,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::path best_path;
for (const auto & search_path : search_paths) {
if (!fs::exists(search_path)) {
if (std::error_code ec; !fs::exists(search_path, ec)) {
if (ec) {
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
} else {
GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
}
continue;
}
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
@ -575,8 +579,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
for (const auto & search_path : search_paths) {
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
fs::path path = search_path / filename;
if (fs::exists(path)) {
if (std::error_code ec; fs::exists(path, ec)) {
return get_reg().load_backend(path, silent);
} else {
if (ec) {
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
}
}
}
return nullptr;
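Both hunks above replace the throwing `fs::exists(path)` with the overload that reports failures through a `std::error_code`, so an unreadable search path is logged instead of raising `filesystem_error`. A standalone sketch of that pattern (the path and messages are made up):

#include <cstdio>
#include <filesystem>
#include <system_error>

namespace fs = std::filesystem;

int main() {
    const fs::path p = "/some/backend/dir"; // illustrative path
    std::error_code ec;
    if (fs::exists(p, ec)) {
        printf("path exists\n");
    } else if (ec) {
        // the status query itself failed (e.g. permission denied) - distinct from "does not exist"
        printf("stat failed: %s\n", ec.message().c_str());
    } else {
        printf("path does not exist\n");
    }
    return 0;
}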

View File

@ -2551,6 +2551,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_OP_ACC:
case GGML_OP_GROUP_NORM:
case GGML_OP_PAD:
// TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
return ggml_get_op_params_i32(op, 8) == 0;
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
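The same guard appears in the OpenCL, SYCL and Metal hunks below: the circular flag lives in `op_params[8]`, and a backend that has not implemented circular padding rejects the op so the scheduler can place it elsewhere. A minimal sketch of the convention (the helper name is hypothetical):

// assumes op_params[8] holds the circular-padding flag, as read by the CPU and CUDA kernels in this diff
static bool example_backend_supports_pad(const struct ggml_tensor * op) {
    const bool circular = ggml_get_op_params_i32(op, 8) != 0;
    return !circular; // only zero-padding is supported by this hypothetical backend
}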

View File

@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
ggml_compute_forward_mul_mat(params, &dst);
}
static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
return (coord + size) % size; // adding size avoids negative number weirdness
}
// ggml_compute_forward_conv_2d
static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
const ggml_tensor * kernel, // [KW, KH, IC, OC]
const ggml_tensor * src, // [W, H, C, N]
@ -7591,6 +7596,7 @@ void ggml_compute_forward_upscale(
// ggml_compute_forward_pad
template<bool circular_t>
static void ggml_compute_forward_pad_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {
@ -7615,13 +7621,29 @@ static void ggml_compute_forward_pad_f32(
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
// TODO: optimize
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
for (int64_t i3 = 0; i3 < ne3; ++i3) {
// circular padding wraps around on a torus, so out-of-range coordinates loop around in every dimension
if constexpr (circular_t) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
const int64_t src_idx =
src_i3*nb03 +
src_i2*nb02 +
src_i1*nb01 +
src_i0*nb00;
const float * src_ptr = (const float *)((char *) src0->data + src_idx);
dst_ptr[dst_idx] = *src_ptr;
} else {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
if ((i0 >= lp0 && i0 < ne0 - rp0) \
&& (i1 >= lp1 && i1 < ne1 - rp1) \
@ -7638,17 +7660,22 @@ static void ggml_compute_forward_pad_f32(
}
}
}
}
void ggml_compute_forward_pad(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_pad_f32(params, dst);
if (circular) {
ggml_compute_forward_pad_f32<true>(params, dst);
} else {
ggml_compute_forward_pad_f32<false>(params, dst);
}
} break;
default:
{
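A few concrete values make `ggml_wrap_around` easier to follow; this is a hedged, standalone check rather than part of the kernel:

#include <assert.h>
#include <stdint.h>

// local copy of the helper above, used only to check a few values
static int64_t wrap_around_check(int64_t coord, int64_t size) {
    return (coord + size) % size;
}

int main(void) {
    assert(wrap_around_check(-1, 4) == 3); // one step left of index 0 lands on the last element
    assert(wrap_around_check( 5, 4) == 1); // coordinates past the end wrap back around (5 -> 1)
    assert(wrap_around_check( 2, 4) == 2); // in-range coordinates are unchanged
    return 0;
}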

View File

@ -577,14 +577,13 @@ namespace ggml_cuda_mma {
const int64_t * xs1 = xs + 2;
xi[2] = xs1[0];
xi[3] = xs1[1];
}else{
NO_DEVICE_CODE;
}
} else {
NO_DEVICE_CODE;
}
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
}
} else {
NO_DEVICE_CODE;
}
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {

View File

@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
case GGML_TYPE_F32:
return ampere_mma_available(cc);
case GGML_TYPE_F16:
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
case GGML_TYPE_BF16:
return ampere_mma_available(cc) || amd_wmma_available(cc);
return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
default:
return false;
}

View File

@ -1,9 +1,17 @@
#include "pad.cuh"
#include <stdint.h>
__device__ __forceinline__ int64_t wrap_around(int64_t coord, int64_t size) {
// + size ensures negatives are handled properly
return (coord + size) % size;
}
static __global__ void pad_f32(const float * src, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3) {
const int ne0, const int ne1, const int ne2, const int ne3,
const bool circular) {
// blockIdx.z: i3*ne2+i2
// blockIdx.y: i1
// blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE
@ -12,15 +20,15 @@ static __global__ void pad_f32(const float * src, float * dst,
int i1 = blockIdx.y;
int i2 = blockIdx.z % ne2;
int i3 = blockIdx.z / ne2;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
// operation
const int64_t dst_idx = i3 * (ne0 * ne1 * ne2) + i2 * (ne0 * ne1) + i1 * ne0 + i0;
if ((i0 >= lp0 && i0 < ne0 - rp0) &&
(i1 >= lp1 && i1 < ne1 - rp1) &&
(i2 >= lp2 && i2 < ne2 - rp2) &&
if (!circular) {
if ((i0 >= lp0 && i0 < ne0 - rp0) && (i1 >= lp1 && i1 < ne1 - rp1) && (i2 >= lp2 && i2 < ne2 - rp2) &&
(i3 >= lp3 && i3 < ne3 - rp3)) {
const int64_t i00 = i0 - lp0;
const int64_t i01 = i1 - lp1;
@ -37,14 +45,35 @@ static __global__ void pad_f32(const float * src, float * dst,
dst[dst_idx] = 0.0f;
}
}
// circular padding wraps around on a torus, so out-of-range coordinates loop around in every dimension
else {
const int64_t ne00 = ne0 - lp0 - rp0;
const int64_t ne01 = ne1 - lp1 - rp1;
const int64_t ne02 = ne2 - lp2 - rp2;
const int64_t ne03 = ne3 - lp3 - rp3;
const int64_t i00 = wrap_around(i0 - lp0, ne00);
const int64_t i01 = wrap_around(i1 - lp1, ne01);
const int64_t i02 = wrap_around(i2 - lp2, ne02);
const int64_t i03 = wrap_around(i3 - lp3, ne03);
const int64_t src_idx = i03 * (ne00 * ne01 * ne02) + i02 * (ne00 * ne01) + i01 * ne00 + i00;
dst[dst_idx] = src[src_idx];
}
}
static void pad_f32_cuda(const float * src, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
const int ne0, const int ne1, const int ne2, const int ne3,
const bool circular, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2 * ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst,
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
ne0, ne1, ne2, ne3, circular);
}
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -65,8 +94,10 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int32_t rp2 = ((const int32_t *) (dst->op_params))[5];
const int32_t lp3 = ((const int32_t *) (dst->op_params))[6];
const int32_t rp3 = ((const int32_t *) (dst->op_params))[7];
const int32_t circular = ((const int32_t *) (dst->op_params))[8];
pad_f32_cuda(src0_d, dst_d,
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(bool) circular, stream);
}
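The kernel flattens i2 and i3 into blockIdx.z and recovers them with a modulo/division pair; a hedged host-side check of that decomposition with made-up extents:

#include <cassert>

int main() {
    const int ne2 = 4, ne3 = 2; // illustrative extents; gridDim.z would be ne2*ne3 = 8
    for (int z = 0; z < ne2*ne3; ++z) {
        const int i2 = z % ne2; // matches the kernel: i2 = blockIdx.z % ne2
        const int i3 = z / ne2; // matches the kernel: i3 = blockIdx.z / ne2
        assert(i3*ne2 + i2 == z);
    }
    return 0;
}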

View File

@ -24,9 +24,6 @@ struct ggml_metal_command_buffer {
};
struct ggml_metal {
id<MTLDevice> device;
id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
ggml_metal_device_t dev;
ggml_metal_library_t lib;
@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
// init context
ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
res->device = ggml_metal_device_get_obj(dev);
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// TODO: would it be better to have one queue for the backend and one queue for the device?
// the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
//res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
res->queue = ggml_metal_device_get_queue(dev);
if (res->queue == nil) {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
if (queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
}
@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@autoreleasepool {
// wrap the source data into a Metal buffer
id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];
@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:buf_src
@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@autoreleasepool {
id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// number of threads in addition to the main thread
const int n_cb = ctx->n_cb;
// keep the memory wired
ggml_metal_device_rsets_keep_alive(ctx->dev);
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
if (!ctx->capture_started) {
// create capture scope
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
}
}
// short-hand
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[n_cb].obj) {
@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// prepare the rest of the command buffers asynchronously (optional)
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[cb_idx].obj) {
@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
}
bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
GGML_ASSERT(ctx->device != nil);
GGML_ASSERT(ctx->dev != nil);
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {

View File

@ -186,6 +186,16 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_att
int32_t dv,
int32_t nwg);
// MTLResidencySet wrapper
typedef void * ggml_metal_rset_t;
// a collection of residency sets (non-owning)
typedef struct ggml_metal_rsets * ggml_metal_rsets_t;
ggml_metal_rsets_t ggml_metal_rsets_init(void);
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
//
// device
//
@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue
ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);
void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);

View File

@ -1,7 +1,6 @@
#import "ggml-metal-device.h"
#import "ggml-impl.h"
#import "ggml-threading.h"
#include <Foundation/Foundation.h>
@ -519,11 +518,106 @@ struct ggml_metal_device {
// ref: https://github.com/ggml-org/llama.cpp/pull/15906
id<MTLCommandQueue> mtl_queue;
ggml_metal_rsets_t rsets;
ggml_metal_library_t library;
struct ggml_metal_device_props props;
};
//
// MTLResidencySet wrapper
//
struct ggml_metal_rsets {
NSLock * lock;
NSMutableArray * data;
// number of seconds since the last graph computation
// keep the residency sets wired for that amount of time to avoid being collected by the OS
int keep_alive_s;
// background heartbeat thread to keep the residency sets alive
atomic_bool d_stop;
atomic_int d_loop;
dispatch_group_t d_group;
};
ggml_metal_rsets_t ggml_metal_rsets_init(void) {
ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
res->lock = [[NSLock alloc] init];
res->data = [[NSMutableArray alloc] init];
// by default keep the memory wired for 3 minutes
res->keep_alive_s = 3*60;
const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
}
if (res->keep_alive_s <= 0) {
res->keep_alive_s = 3*60;
}
GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
res->d_group = dispatch_group_create();
// start a background thread that periodically requests residency for all the currently active sets in the collection
// the requests stop after a certain amount of time (keep_alive_s) of inactivity
dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
dispatch_group_async(res->d_group, d_queue, ^{
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
[res->lock lock];
for (int i = 0; i < (int) res->data.count; ++i) {
[res->data[i] requestResidency];
}
atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
[res->lock unlock];
}
// half a second
usleep(500 * 1000);
}
}
#endif
});
return res;
}
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
if (rsets == NULL) {
return;
}
// note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
GGML_ASSERT([rsets->data count] == 0);
atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
dispatch_release(rsets->d_group);
[rsets->data release];
[rsets->lock release];
free(rsets);
}
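The heartbeat thread sleeps ~500 ms per iteration and `d_loop` is reset to `2*keep_alive_s` on every graph compute (see `ggml_metal_device_rsets_keep_alive` below), so residency keeps being requested for roughly `keep_alive_s` seconds after the last compute. A hedged check of that arithmetic with the default setting:

#include <cstdio>

int main() {
    const int    keep_alive_s = 3*60;           // default, overridable via GGML_METAL_RESIDENCY_KEEP_ALIVE_S
    const int    d_loop       = 2*keep_alive_s; // iterations granted after each graph compute
    const double step_s       = 0.5;            // the heartbeat sleeps 500 * 1000 us per iteration
    printf("residency requested for ~%.0f s after the last compute\n", d_loop * step_s); // ~180 s
    return 0;
}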
ggml_metal_device_t ggml_metal_device_init(void) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
@ -692,7 +786,11 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
}
// --------------------------------------------------
if (dev->props.use_residency_sets) {
dev->rsets = ggml_metal_rsets_init();
} else {
dev->rsets = nil;
}
// print MTL GPU family:
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
@ -745,6 +843,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
void ggml_metal_device_free(ggml_metal_device_t dev) {
assert(dev != NULL);
ggml_metal_rsets_free(dev->rsets);
ggml_metal_library_free(dev->library);
dev->library = NULL;
@ -773,6 +873,42 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
return dev->library;
}
void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
if (rset == nil) {
return;
}
GGML_ASSERT(dev->rsets);
[dev->rsets->lock lock];
[dev->rsets->data addObject:rset];
[dev->rsets->lock unlock];
}
void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
if (rset == nil) {
return;
}
GGML_ASSERT(dev->rsets);
[dev->rsets->lock lock];
[dev->rsets->data removeObject:rset];
[dev->rsets->lock unlock];
}
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
if (dev->rsets == NULL) {
return;
}
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
if (@available(macOS 10.12, iOS 16.0, *)) {
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
@ -901,6 +1037,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_POOL_2D:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_PAD:
// TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
if (ggml_get_op_params_i32(op, 8) != 0) {
return false;
}
return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
case GGML_OP_PAD_REFLECT_1D:
@ -1066,9 +1207,8 @@ struct ggml_metal_buffer {
// note: cannot use an explicit "id<MTLResidencySet>" here because it is not available on certain OSes
id rset;
// pointers to global device objects
id<MTLDevice> device;
id<MTLCommandQueue> queue;
// pointer to the global device
ggml_metal_device_t dev;
};
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@ -1111,7 +1251,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
desc.initialCapacity = buf->n_buffers;
NSError * error;
buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
[desc release];
@ -1172,6 +1312,8 @@ static void * ggml_metal_host_malloc(size_t n) {
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
res->dev = dev;
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
@ -1196,9 +1338,6 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
res->owned = true;
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);
res->n_buffers = 1;
if (res->all_data != NULL) {
@ -1207,12 +1346,12 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
if (size_aligned > 0) {
if (props_dev->use_shared_buffers && shared) {
res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
} else {
res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
}
}
@ -1233,6 +1372,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
return NULL;
}
ggml_metal_device_rsets_add(dev, res->rset);
//ggml_metal_log_allocated_size(device, size_aligned);
return res;
@ -1241,6 +1382,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
res->dev = dev;
res->all_data = ptr;
res->all_size = size;
@ -1263,9 +1406,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
size_aligned += (size_page - (size_aligned % size_page));
}
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
// the buffer fits into the max buffer size allowed by the device
@ -1275,7 +1415,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
res->buffers[res->n_buffers].metal = nil;
if (size_aligned > 0) {
res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
@ -1284,7 +1424,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
}
}
ggml_metal_log_allocated_size(res->device, size_aligned);
ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
++res->n_buffers;
} else {
@ -1302,7 +1442,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
res->buffers[res->n_buffers].metal = nil;
if (size_step_aligned > 0) {
res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
@ -1311,7 +1451,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
}
}
ggml_metal_log_allocated_size(res->device, size_step_aligned);
ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
if (i + size_step < size) {
GGML_LOG_INFO("\n");
@ -1329,10 +1469,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
return NULL;
}
ggml_metal_device_rsets_add(dev, res->rset);
return res;
}
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
ggml_metal_device_rsets_rm(buf->dev, buf->rset);
for (int i = 0; i < buf->n_buffers; i++) {
[buf->buffers[i].metal release];
}
@ -1369,8 +1513,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
bid_dst.offs += offset;
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@ -1396,7 +1539,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
@autoreleasepool {
// src
void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
@ -1411,8 +1554,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
// this is an alternative to waitUntilCompleted, which should be faster, but doesn't seem to make much difference
dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@ -1454,15 +1596,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
bid_src.offs += offset;
// dst
id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
GGML_ASSERT(buf_dst);
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@ -1488,8 +1629,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
}
@autoreleasepool {
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

View File

@ -3083,6 +3083,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_REPEAT:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
case GGML_OP_PAD:
// TODO: add circular padding support for opencl, see https://github.com/ggml-org/llama.cpp/pull/16985
if (ggml_get_op_params_i32(op, 8) != 0) {
return false;
}
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE: {
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);

View File

@ -128,6 +128,7 @@ struct rpc_msg_device_count_rsp {
struct rpc_msg_get_alloc_size_req {
uint32_t device;
rpc_tensor tensor;
rpc_tensor srcs[GGML_MAX_SRC];
};
struct rpc_msg_get_alloc_size_rsp {
@ -572,6 +573,11 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
rpc_tensor result;
if (!tensor) {
memset(&result, 0, sizeof(result));
return result;
}
result.id = reinterpret_cast<uint64_t>(tensor);
result.type = tensor->type;
if (tensor->buffer) {
@ -753,23 +759,41 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
}
static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
// should we query the remote server for the actual size
bool rpc_get = false;
// See comments in init_tensor.
if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
rpc_get |= ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr);
// ops that require additional memory for fleeting data on certain backends
// ref: https://github.com/ggml-org/llama.cpp/pull/15966
rpc_get |= tensor->op == GGML_OP_FLASH_ATTN_EXT;
rpc_get |= tensor->op == GGML_OP_MUL_MAT_ID;
if (rpc_get) {
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
auto sock = get_socket(buft_ctx->endpoint);
rpc_msg_get_alloc_size_req request;
request.device = buft_ctx->device;
request.tensor = serialize_tensor(tensor);
rpc_msg_get_alloc_size_req request = {
/*.device =*/ buft_ctx->device,
/*.tensor =*/ serialize_tensor(tensor),
/*.srcs =*/ {},
};
// .get_alloc_size could be a function of the tensor's srcs, so we must serialize them as well
for (int i = 0; i < GGML_MAX_SRC; i++) {
request.srcs[i] = serialize_tensor(tensor->src[i]);
}
// TODO: cache the alloc responses to avoid extra RPC calls?
rpc_msg_get_alloc_size_rsp response;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
RPC_STATUS_ASSERT(status);
return response.alloc_size;
} else {
return ggml_nbytes(tensor);
}
return ggml_nbytes(tensor);
}
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@ -1017,7 +1041,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
}
ggml_backend_buffer_type_t buft;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_size =*/ ggml_tensor_overhead()*(1 + GGML_MAX_SRC),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@ -1025,12 +1049,18 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) {
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
return false;
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (request.srcs[i].id != 0) {
tensor->src[i] = deserialize_tensor(ctx, &request.srcs[i]);
}
}
LOG_DBG("[%s] device: %d, buffer: %p, data: %p\n", __func__, dev_id, (void*)tensor->buffer, tensor->data);
if (tensor->buffer == nullptr) {
//No buffer allocated.
@ -1227,7 +1257,8 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
char hash_str[17];
snprintf(hash_str, sizeof(hash_str), "%016" PRIx64, hash);
fs::path cache_file = fs::path(cache_dir) / hash_str;
if (!fs::exists(cache_file)) {
std::error_code ec;
if (!fs::exists(cache_file, ec)) {
return false;
}
std::ifstream ifs(cache_file, std::ios::binary);
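Passing a std::error_code to fs::exists selects the non-throwing overload, so an I/O or permission error on the cache directory degrades to a cache miss instead of propagating an exception. A minimal sketch of the pattern, with an illustrative helper name:

#include <filesystem>
#include <system_error>

namespace fs = std::filesystem;

// Non-throwing existence check: on failure, ec is set and exists() returns false,
// so the caller simply treats the cache entry as absent.
static bool cache_entry_exists(const fs::path & cache_file) {
    std::error_code ec;
    return fs::exists(cache_file, ec);
}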


@ -4613,6 +4613,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ACC:
return true;
case GGML_OP_PAD:
// TODO: add circular padding support for sycl, see https://github.com/ggml-org/llama.cpp/pull/16985
if (ggml_get_op_params_i32(op, 8) != 0) {
return false;
}
return ggml_is_contiguous(op->src[0]);
case GGML_OP_LEAKY_RELU:
case GGML_OP_TIMESTEP_EMBEDDING:


@ -353,10 +353,17 @@ enum vk_conv_shapes {
CONV_SHAPE_COUNT,
};
uint32_t conv_shapes_wg_denoms[][3] = {
{ 128, 128, 1 },
{ 64, 32, 1 },
{ 32, 256, 1 },
struct vk_conv_block_size {
uint32_t K;
uint32_t NPQ;
uint32_t CRS;
};
vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
// K NPQ CRS
{ 128, 128, 16 }, // CONV_SHAPE_128x128
{ 64, 32, 32 }, // CONV_SHAPE_64x32
{ 32, 256, 16 }, // CONV_SHAPE_32x256
};
enum dmmv_wg_sizes {
@ -519,6 +526,7 @@ struct vk_device_struct {
bool fp16;
bool bf16;
bool pipeline_robustness;
bool memory_priority;
vk::Device device;
uint32_t vendor_id;
vk::DriverId driver_id;
@ -1042,6 +1050,7 @@ struct vk_op_pad_push_constants {
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t misalign_offsets;
uint32_t circular;
uint32_t lp0; uint32_t rp0;
uint32_t lp1; uint32_t rp1;
@ -1084,6 +1093,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor
p.rp2 = dst->op_params[5];
p.lp3 = dst->op_params[6];
p.rp3 = dst->op_params[7];
p.circular = dst->op_params[8];
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
}
@ -1343,20 +1353,11 @@ struct vk_op_conv2d_push_constants {
uint32_t Cin;
uint32_t N;
uint32_t KW;
uint32_t KH;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
@ -1380,48 +1381,6 @@ template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
struct vk_op_conv_transpose_2d_push_constants {
uint32_t Cout;
uint32_t Cin;
uint32_t N;
uint32_t KW;
uint32_t KH;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// init_fastdiv_values constants for dividing by OW, OW*OH
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
};
template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) {
// Compute magic values to divide by OW, OW*OH
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
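The fastdiv push constants (OWmp/OWL, OWOHmp/OWOHL) let the shader replace integer division by OW and OW*OH with a multiply-high, an add and a shift. A minimal sketch of the usual round-up magic-number scheme this relies on; the exact rounding in init_fastdiv_values may differ, so treat this as illustrative:

#include <cassert>
#include <cstdint>

// Precompute (mp, L) so that n / d == (umulhi(n, mp) + n) >> L for all 32-bit n.
static void fastdiv_init(uint32_t d, uint32_t & mp, uint32_t & L) {
    L = 0;
    while (L < 32 && (uint32_t(1) << L) < d) {
        L++;                                                            // L = ceil(log2(d))
    }
    mp = (uint32_t) (((uint64_t(1) << 32) * ((uint64_t(1) << L) - d)) / d + 1);
}

static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint64_t hi = ((uint64_t) n * mp) >> 32;                      // multiply-high
    return (uint32_t) ((hi + n) >> L);                                  // add and shift in 64 bits to avoid overflow
}

int main() {
    uint32_t mp, L;
    fastdiv_init(56, mp, L);                                            // e.g. OW = 56
    for (uint32_t n = 0; n < 1000000; ++n) {
        assert(fastdiv(n, mp, L) == n / 56);
    }
}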
struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
@ -2369,7 +2328,13 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
const vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f };
vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
if (device->memory_priority) {
mem_flags_info.setPNext(&mem_priority_info);
}
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
const auto & req_flags = *it;
@ -3574,7 +3539,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
SHADER_REDUCTION_MODE_SHMEM;
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
@ -3598,7 +3563,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
@ -3644,7 +3609,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
}
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {wg_size_subgroup, 1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size);
@ -4050,7 +4015,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE +
sizeof(int) * device->subgroup_size +
2 * sizeof(int) +
(BLOCK_SIZE / device->subgroup_size) * sizeof(int);
2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int);
if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot &&
nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size);
@ -4070,10 +4035,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
for (auto &s : device->pipeline_solve_tri_f32) {
const vk_solve_tri_pipeline_state &state = s.first;
// Max number of rows to load at a time, limited by shared memory
const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((state.N + state.K) * sizeof(float));
// Need at least K invocations, and prefer a minimum of 128 to spread out loading shared memory
const uint32_t block_size = std::max(128u, 1u << (uint32_t)ceilf(log2f(float(state.K))));
ggml_vk_create_pipeline(
device, s.second, "solve_tri_f32",
solve_tri_f32_len, solve_tri_f32_data, "main", 3,
sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K }, 1, true);
sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K, batch_N, block_size }, 1, true);
}
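The two new specialization constants bound how many rows of A and B the shader stages in shared memory per pass and how many threads it launches. Plugging in illustrative values (N = 64, K = 32, a 48 KiB shared-memory limit):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    const uint32_t max_shmem = 48 * 1024;   // hypothetical maxComputeSharedMemorySize
    const uint32_t N = 64, K = 32;

    // Rows that fit in shared memory per pass: 49152 / ((64 + 32) * 4) = 128.
    const uint32_t batch_N = max_shmem / ((N + K) * sizeof(float));
    // At least K threads, but prefer >= 128 to spread the shared-memory loads: max(128, 32) = 128.
    const uint32_t block_size = std::max(128u, 1u << (uint32_t) ceilf(log2f((float) K)));

    assert(batch_N == 128 && block_size == 128);
}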
#define IM2COL(bda) \
@ -4119,12 +4090,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
// conv2d, conv_transpose_2d
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8;
uint32_t conv2d_SHMEM_PAD = 4;
vk_conv_block_size conv2d_BS = vk_conv_block_sizes[s];
bool conv2d_UNROLL = true;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@ -4138,29 +4107,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
conv2d_UNROLL = false;
} else if (device->vendor_id == VK_VENDOR_ID_AMD) {
conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
}
switch (s) {
default:
case CONV_SHAPE_128x128:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_128x128][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_128x128][1];
conv2d_BS_CRS = 16;
if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
if (s == CONV_SHAPE_128x128 && device->architecture != vk_device_architecture::AMD_GCN) {
conv2d_UNROLL = false;
}
break;
case CONV_SHAPE_64x32:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_64x32][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_64x32][1];
conv2d_BS_CRS = 32;
conv2d_TS_K = 4;
break;
case CONV_SHAPE_32x256:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_32x256][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_32x256][1];
conv2d_BS_CRS = 16;
break;
}
// Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
@ -4174,22 +4123,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
allow_collectives_nv &&
allow_collectives_amd) {
use_collectives = 1;
conv2d_BS_CRS = std::min(
conv2d_BS.CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
}
uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
(conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
conv2d_BS.CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
}
}
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
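The conv2d_shmem_req check a few lines above scales with the selected block shape; plugging in the 128x128 shape with CRS = 16 and SHMEM_PAD = 4 as illustrative values:

// (BS.K * (BS.CRS + PAD) + BS.CRS * (BS.NPQ + PAD)) * sizeof(float)
//   = (128 * (16 + 4) + 16 * (128 + 4)) * 4
//   = (2560 + 2112) * 4 = 18688 bytes,
// comfortably under typical 32 KiB / 48 KiB shared-memory limits; if the limit
// were smaller, BS.CRS falls back to 8 (capped at the subgroup size when
// collectives are in use).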
#define CREATE_CONV(name, type_suffix, spv_suffix) \
for (auto &c : device->pipeline_##name##type_suffix[s]) { \
@ -4206,15 +4155,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline( \
device, c.second, #name #type_suffix, \
name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
sizeof(vk_op_##name##_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
}
#define CREATE_CONVS(spv_suffix) \
CREATE_CONV(conv2d, _f32, spv_suffix) \
CREATE_CONV(conv2d, _f16_f32, spv_suffix) \
if (device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_conv_transpose_2d_push_constants)) { \
CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \
CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) \
}
CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix)
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
CREATE_CONVS(_cm2)
@ -4235,9 +4182,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
}
for (auto &c : compiles) {
@ -4340,6 +4287,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
#endif
} else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
pipeline_executable_properties_support = true;
} else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) {
device->memory_priority = true;
}
}
@ -4531,6 +4481,16 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_pipeline_robustness");
}
VkPhysicalDeviceMemoryPriorityFeaturesEXT memory_priority_features;
memory_priority_features.pNext = nullptr;
memory_priority_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT;
memory_priority_features.memoryPriority = VK_FALSE;
if (device->memory_priority) {
last_struct->pNext = (VkBaseOutStructure *)&memory_priority_features;
last_struct = (VkBaseOutStructure *)&memory_priority_features;
device_extensions.push_back("VK_EXT_memory_priority");
}
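For reference, what this opt-in wiring ultimately enables at allocation time is a VkMemoryPriorityAllocateInfoEXT chained into the allocate info, marking compute buffers as high priority for residency. A rough plain-C sketch; the function and variable names here are placeholders, not the backend's:

#include <vulkan/vulkan.h>

static VkResult alloc_with_priority(VkDevice dev, VkDeviceSize size, uint32_t mem_type,
                                    bool memory_priority_enabled, VkDeviceMemory * out) {
    VkMemoryPriorityAllocateInfoEXT prio = {};
    prio.sType    = VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT;
    prio.priority = 1.0f;                                            // 0.0 = lowest, 1.0 = highest residency hint

    VkMemoryAllocateInfo info = {};
    info.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    info.pNext           = memory_priority_enabled ? &prio : NULL;   // only chain when the extension is enabled
    info.allocationSize  = size;
    info.memoryTypeIndex = mem_type;
    return vkAllocateMemory(dev, &info, NULL, out);
}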
VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
subgroup_size_control_features.pNext = nullptr;
subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
@ -5110,7 +5070,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
}
}
static bool ggml_vk_instance_validation_ext_available();
static bool ggml_vk_instance_layer_settings_available();
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
@ -5139,19 +5099,19 @@ static void ggml_vk_instance_init() {
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version };
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
const bool validation_ext = ggml_vk_instance_validation_ext_available();
const bool layer_settings = ggml_vk_instance_layer_settings_available();
#ifdef __APPLE__
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
#endif
const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr;
std::vector<const char*> layers;
if (validation_ext) {
if (layer_settings) {
layers.push_back("VK_LAYER_KHRONOS_validation");
}
std::vector<const char*> extensions;
if (validation_ext) {
extensions.push_back("VK_EXT_validation_features");
if (layer_settings) {
extensions.push_back("VK_EXT_layer_settings");
}
#ifdef __APPLE__
if (portability_enumeration_ext) {
@ -5161,26 +5121,24 @@ static void ggml_vk_instance_init() {
if (debug_utils_ext) {
extensions.push_back("VK_EXT_debug_utils");
}
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
VkBool32 enable_best_practice = layer_settings;
std::vector<vk::LayerSettingEXT> settings = {
{
"VK_LAYER_KHRONOS_validation",
"validate_best_practices",
vk::LayerSettingTypeEXT::eBool32,
1,
&enable_best_practice
},
};
vk::LayerSettingsCreateInfoEXT layer_setting_info(settings);
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions, &layer_setting_info);
#ifdef __APPLE__
if (portability_enumeration_ext) {
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
}
#endif
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
vk::ValidationFeaturesEXT validation_features;
if (validation_ext) {
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
validation_features = {
features_enable,
{},
};
validation_features.setPNext(nullptr);
instance_create_info.setPNext(&validation_features);
GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
}
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
@ -6928,6 +6886,10 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
// Quantization overhead is not worth it for small k
switch (device->vendor_id) {
case VK_VENDOR_ID_NVIDIA:
if (src0_type == GGML_TYPE_Q2_K) {
return true;
}
if (k <= 4096) {
return false;
}
@ -8260,59 +8222,23 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}
static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) {
auto n_tiles = [&](vk_conv_shapes s) {
return CEIL_DIV(K, vk_conv_block_sizes[s].K)
* CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ);
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
// We can't query the number of shader cores on Intel, use 32 as a placeholder
// so small convolutions will still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
return CONV_SHAPE_128x128;
} else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
return CONV_SHAPE_32x256;
} else {
return CONV_SHAPE_64x32;
}
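ggml_vk_conv_select_shape picks the block shape whose tile grid keeps every (assumed) shader core busy at least twice over. Working through illustrative sizes with the block table above and the 32-core placeholder:

#include <cassert>
#include <cstdint>

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))   // mirrors the helper used in the backend

int main() {
    const uint32_t cores = 32;                    // placeholder shader_core_count
    // Wide conv: K (Cout) = 256, NPQ (N*OH*OW) = 8192.
    uint32_t tiles_128 = CEIL_DIV(256u, 128u) * CEIL_DIV(8192u, 128u);    // 2 * 64 = 128
    assert(256 > 64 && tiles_128 >= cores * 2);                           // -> CONV_SHAPE_128x128
    // Narrow conv: K = 16, NPQ = 32768.
    uint32_t tiles_32 = CEIL_DIV(16u, 32u) * CEIL_DIV(32768u, 256u);      // 1 * 128 = 128
    assert(16 <= 32 && tiles_32 >= cores * 2);                            // -> CONV_SHAPE_32x256
    // Anything smaller falls through to CONV_SHAPE_64x32.
}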
static std::array<uint32_t, 3> ggml_vk_get_conv_transpose_2d_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
// src0 - kernel: [KW, KH, Cout, Cin]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins - 1) * s - 2 * p + (ks - 1) * d + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[2];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[0], 0, 1);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], 0, 1);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
}
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) {
@ -8775,39 +8701,20 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return nullptr;
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
std::array<uint32_t, 3> elements{};
if (op == GGML_OP_CONV_2D) elements = ggml_vk_get_conv_elements(dst);
else if (op == GGML_OP_CONV_TRANSPOSE_2D) elements = ggml_vk_get_conv_transpose_2d_elements(dst);
vk_conv_shapes shape;
uint32_t tiles[CONV_SHAPE_COUNT];
for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
tiles[i] = CEIL_DIV(elements[0], conv_shapes_wg_denoms[i][0]) * CEIL_DIV(elements[1], conv_shapes_wg_denoms[i][1]);
}
// We can't query number of shader cores on Intel, use 32 as a placeholder
// so small convolutions will still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
shape = CONV_SHAPE_128x128;
} else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
shape = CONV_SHAPE_32x256;
} else {
shape = CONV_SHAPE_64x32;
}
uint32_t KW = static_cast<uint32_t>(src0->ne[0]);
uint32_t KH = static_cast<uint32_t>(src0->ne[1]);
uint32_t s0 = static_cast<uint32_t>(dst->op_params[0]);
uint32_t s1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[1]) : static_cast<uint32_t>(dst->op_params[0]);
uint32_t p0 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[2]) : 0;
uint32_t p1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[3]) : 0;
uint32_t d0 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[4]) : 1;
uint32_t d1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[5]) : 1;
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
uint32_t K = dst->ne[2]; // Cout
uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW
vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ);
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
uint32_t KW = (uint32_t)src0->ne[0];
uint32_t KH = (uint32_t)src0->ne[1];
uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0));
uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0;
uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0;
uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
@ -9126,13 +9033,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
elements = { N * OC * OH * OW, 1, 1};
} break;
case GGML_OP_CONV_2D:
{
elements = ggml_vk_get_conv_elements(dst);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
elements = ggml_vk_get_conv_transpose_2d_elements(dst);
} break;
if constexpr (std::is_same_v<PC, vk_op_conv2d_push_constants>) {
const uint32_t NPQ = pc.N * pc.OH * pc.OW;
const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ);
const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ);
elements = { pc.Cout, NPQ_blocks, 1 };
if (elements[1] > 512) {
elements[2] = CEIL_DIV(elements[1], 512);
elements[1] = 512;
}
} else {
GGML_ABORT("invalid push constant type for CONV_2D");
}
break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
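The grid split for CONV_2D above caps the second dispatch dimension at 512 workgroups and folds the overflow into the third; the conv2d shader recombines them (B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512) and early-exits blocks past NPQ. With illustrative numbers:

#include <cassert>
#include <cstdint>

int main() {
    uint32_t npq_blocks = 1300;          // hypothetical CEIL_DIV(NPQ, BS.NPQ)
    uint32_t y = npq_blocks, z = 1;
    if (y > 512) {
        z = (y + 511) / 512;             // 3
        y = 512;
    }
    assert(y * z == 1536);               // 1536 groups launched; the 236 groups with
                                         // B_idx_NPQ >= 1300 return immediately
}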
@ -10683,30 +10598,24 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
vk_op_conv2d_push_constants p{};
p.Cout = static_cast<uint32_t>(ne03);
p.Cin = static_cast<uint32_t>(ne02);
p.Cout = static_cast<uint32_t>(!transpose ? ne03 : ne02);
p.Cin = static_cast<uint32_t>(!transpose ? ne02 : ne03);
p.N = static_cast<uint32_t>(ne13);
GGML_ASSERT(p.Cout == ne2);
GGML_ASSERT(p.Cin == ne12);
p.KW = static_cast<uint32_t>(ne00);
p.KH = static_cast<uint32_t>(ne01);
p.W = static_cast<uint32_t>(ne10);
p.H = static_cast<uint32_t>(ne11);
p.OW = static_cast<uint32_t>(ne0);
p.OH = static_cast<uint32_t>(ne1);
p.s0 = static_cast<uint32_t>(dst->op_params[0]);
p.s1 = static_cast<uint32_t>(dst->op_params[1]);
p.p0 = static_cast<uint32_t>(dst->op_params[2]);
p.p1 = static_cast<uint32_t>(dst->op_params[3]);
p.d0 = static_cast<uint32_t>(dst->op_params[4]);
p.d1 = static_cast<uint32_t>(dst->op_params[5]);
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
@ -10719,59 +10628,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
GGML_ASSERT(ne03 == ne2);
GGML_ASSERT(ne02 == ne12);
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p));
}
static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
vk_op_conv_transpose_2d_push_constants p{};
p.Cout = static_cast<uint32_t>(ne02);
p.Cin = static_cast<uint32_t>(ne03);
p.N = static_cast<uint32_t>(ne13);
p.KW = static_cast<uint32_t>(ne00);
p.KH = static_cast<uint32_t>(ne01);
p.W = static_cast<uint32_t>(ne10);
p.H = static_cast<uint32_t>(ne11);
p.OW = static_cast<uint32_t>(ne0);
p.OH = static_cast<uint32_t>(ne1);
p.s0 = static_cast<uint32_t>(dst->op_params[0]);
p.s1 = static_cast<uint32_t>(dst->op_params[0]);
p.p0 = 0;
p.p1 = 0;
p.d0 = 1;
p.d1 = 1;
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
p.nb13 = static_cast<uint32_t>(nb13 / nb10);
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
GGML_ASSERT(ne02 == ne2);
GGML_ASSERT(ne03 == ne12);
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p));
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p));
}
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -12142,11 +11999,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_CONV_2D:
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_TRANSPOSE_2D:
ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node);
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_2D_DW:
@ -14179,10 +14033,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
const uint32_t N = op->src[0]->ne[0];
const uint32_t K = op->src[1]->ne[0];
// K dimension limited to workgroup size
if (K > 128) {
if (K > 1u << device->max_workgroup_size_log2) {
return false;
}
if (N * N * sizeof(float) + N * K * sizeof(float) > device->properties.limits.maxComputeSharedMemorySize) {
const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((N + K) * sizeof(float));
if (batch_N == 0) {
return false;
}
return true;
@ -14255,13 +14111,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
// Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
const vk_device& device = ggml_vk_get_device(ctx->device);
if (op->op == GGML_OP_CONV_TRANSPOSE_2D &&
device->properties.limits.maxPushConstantsSize < sizeof(vk_op_conv_transpose_2d_push_constants)) {
return false;
}
// Channel-contiguous format is not supported yet.
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->src[1]->type == GGML_TYPE_F32 &&
@ -14386,21 +14235,21 @@ ggml_backend_reg_t ggml_backend_vk_reg() {
}
// Extension availability
static bool ggml_vk_instance_validation_ext_available() {
static bool ggml_vk_instance_layer_settings_available() {
#ifdef GGML_VULKAN_VALIDATE
// Check if validation layer provides the extension
const std::string layer_name = "VK_LAYER_KHRONOS_validation";
for (const auto& layer : vk::enumerateInstanceLayerProperties()) {
if (layer_name == layer.layerName.data()) {
for (const auto& ext : vk::enumerateInstanceExtensionProperties(layer_name)) {
if (strcmp("VK_EXT_validation_features", ext.extensionName.data()) == 0) {
if (strcmp("VK_EXT_layer_settings", ext.extensionName.data()) == 0) {
return true;
}
}
}
}
std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_validation_features not found." << std::endl;
std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_layer_settings not found." << std::endl;
#endif
return false;
}


@ -32,22 +32,12 @@ layout(push_constant) uniform parameter {
uint32_t Cin;
uint32_t N;
// Tensor spatial sizes: kernel, input, output
uint32_t KW;
uint32_t KH;
// Tensor spatial sizes: input, output
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
// Parameters: stride, padding, dilation - 0=y, 1=x
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
// Strides in elements
uint32_t nb01;
uint32_t nb02;
@ -77,13 +67,14 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
layout(constant_id = 4) const uint TS_K = 8;
layout(constant_id = 5) const uint use_collectives = 1;
layout(constant_id = 6) const uint SHMEM_PAD = 4;
// Stride, padding, dilation
layout(constant_id = 7) const uint s0 = 1;
layout(constant_id = 8) const uint s1 = 1;
layout(constant_id = 9) const uint p0 = 0;
layout(constant_id = 10) const uint p1 = 0;
layout(constant_id = 11) const uint d0 = 1;
layout(constant_id = 12) const uint d1 = 1;
// Kernel spatial sizes
layout(constant_id = 13) const uint KW = 1;
layout(constant_id = 14) const uint KH = 1;
@ -138,7 +129,7 @@ P,Q=OH,OW
*/
uint32_t B_idx_K = gl_WorkGroupID.x;
uint32_t B_idx_NPQ = gl_WorkGroupID.y;
uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512;
uint32_t T_y = tid / NT_NPQ;
uint32_t T_x = tid % NT_NPQ;
@ -178,6 +169,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
#endif
void main() {
if (B_idx_NPQ * BS_NPQ >= NPQ) {
return;
}
#ifdef COOPMAT2
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);


@ -8,6 +8,7 @@ layout (push_constant) uniform parameter
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint misalign_offsets;
uint circular;
uint lp0; uint rp0;
uint lp1; uint rp1;
@ -18,6 +19,10 @@ layout (push_constant) uniform parameter
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
uint wrap_around(int coord, uint size) {
return (uint(coord + int(size))) % size; // add size to avoid issues with negative coordinates
}
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
@ -40,10 +45,20 @@ void main() {
const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00;
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
if (p.circular != 0u) {
const uint ci0 = wrap_around(int(i0) - int(p.lp0), p.ne00);
const uint ci1 = wrap_around(int(i1) - int(p.lp1), p.ne01);
const uint ci2 = wrap_around(int(i2) - int(p.lp2), p.ne02);
const uint ci3 = wrap_around(int(i3) - int(p.lp3), p.ne03);
const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00;
data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]);
} else {
const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
}
}
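A scalar reference of the circular branch above, reduced to one dimension for clarity (sizes are illustrative):

#include <cstdio>
#include <vector>

// Circular padding of a 1-D row: output index i maps back to (i - lp) wrapped into [0, ne0).
static std::vector<float> pad_circular_1d(const std::vector<float> & src, int lp, int rp) {
    const int ne0 = (int) src.size();
    std::vector<float> dst(src.size() + lp + rp);
    for (int i = 0; i < (int) dst.size(); ++i) {
        const int wrapped = (((i - lp) % ne0) + ne0) % ne0;   // double mod handles negative coords
        dst[i] = src[wrapped];
    }
    return dst;
}

int main() {
    const auto out = pad_circular_1d({0, 1, 2, 3}, 2, 1);     // -> 2 3 0 1 2 3 0
    for (float v : out) printf("%g ", v);
    printf("\n");
}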


@ -131,8 +131,12 @@ void main() {
rms_norm(num_blocks);
} else if (num_blocks > 16) {
rms_norm(32);
} else if (num_blocks > 8) {
} else if (num_blocks > 12) {
rms_norm(16);
} else if (num_blocks > 10) {
rms_norm(12);
} else if (num_blocks > 8) {
rms_norm(10);
} else if (num_blocks > 4) {
rms_norm(8);
} else if (num_blocks == 4) {


@ -5,8 +5,9 @@
layout (constant_id = 1) const uint N = 64;
layout (constant_id = 2) const uint K = 32;
layout (constant_id = 3) const uint BATCH_N = 32;
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 4, local_size_y = 1, local_size_z = 1) in;
uint a_base, b_base, x_base;
@ -22,8 +23,8 @@ void store_x(uint r, uint c, FLOAT_TYPE v) {
data_d[x_base + r * p.nb21 + c * p.nb20] = D_TYPE(v);
}
shared FLOAT_TYPE shA[N * N];
shared FLOAT_TYPE shB[N * K];
shared FLOAT_TYPE shA[BATCH_N * N];
shared FLOAT_TYPE shB[BATCH_N * K];
void main() {
const uint batch = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
@ -39,34 +40,42 @@ void main() {
b_base = get_boffset() + i2 * p.nb12 + i3 * p.nb13;
x_base = get_doffset() + i2 * p.nb22 + i3 * p.nb23;
// Load the A matrix into shA
[[unroll]] for (uint i = 0; i < N * N; i += gl_WorkGroupSize.x) {
FLOAT_TYPE X[N];
// Loop over batches of rows
[[unroll]] for (uint row_base = 0; row_base < N; row_base += BATCH_N) {
const uint cur_N = min(BATCH_N, N - row_base);
// Load the A matrix batch into shA
[[unroll]] for (uint i = 0; i < cur_N * N; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * N) % gl_WorkGroupSize.x == 0) || idx < N * N) {
shA[idx] = get_a(idx / N, idx % N);
if (((cur_N * N) % gl_WorkGroupSize.x == 0) || idx < cur_N * N) {
shA[idx] = get_a(row_base + idx / N, idx % N);
}
}
// Load the B matrix into shB
[[unroll]] for (uint i = 0; i < N * K; i += gl_WorkGroupSize.x) {
// Load the B matrix batch into shB
[[unroll]] for (uint i = 0; i < cur_N * K; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * K) % gl_WorkGroupSize.x == 0) || idx < N * K) {
shB[idx] = get_b(idx / K, idx % K);
if (((cur_N * K) % gl_WorkGroupSize.x == 0) || idx < cur_N * K) {
shB[idx] = get_b(row_base + idx / K, idx % K);
}
}
barrier();
FLOAT_TYPE X[N];
// Each thread solves one column
if (tid < K) {
[[unroll]] for (int r = 0; r < N; ++r) {
FLOAT_TYPE b = shB[r * K + tid];
[[unroll]] for (uint row_offset = 0; row_offset < cur_N; ++row_offset) {
uint r = row_base + row_offset;
FLOAT_TYPE b = shB[row_offset * K + tid];
// Compute x[r,c] = (b[r,c] - sum(a[r,c]*x[c])) / a[r,r]
[[unroll]] for (int c = 0; c < r; ++c) {
b -= shA[r * N + c] * X[c];
b -= shA[row_offset * N + c] * X[c];
}
FLOAT_TYPE x = b / shA[r * N + r];
FLOAT_TYPE x = b / shA[row_offset * N + r];
X[r] = x;
store_x(r, tid, x);
}
}
barrier();
}
}
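For reference, the per-column recurrence the shader implements is plain forward substitution on a lower-triangular A; a scalar version without the shared-memory row batching:

// Solve A * x = b for one right-hand-side column, with A lower-triangular (N x N, row-major).
static void solve_tri_column(const float * A, const float * b, float * x, int N) {
    for (int r = 0; r < N; ++r) {
        float acc = b[r];
        for (int c = 0; c < r; ++c) {
            acc -= A[r * N + c] * x[c];   // subtract contributions of already-solved rows
        }
        x[r] = acc / A[r * N + r];        // divide by the diagonal element
    }
}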


@ -75,7 +75,7 @@ void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit
}
void main() {
const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
if (row >= n_rows) {
return;
}
@ -83,17 +83,18 @@ void main() {
const uint logits_offset = n_experts * row;
const uint weights_offset = n_expert_used * row;
const uint ids_offset = n_experts * row;
const uint lane = gl_SubgroupInvocationID;
float wt[experts_per_thread];
[[unroll]]
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
const uint expert = i + gl_LocalInvocationID.x;
const uint expert = i + lane;
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
}
if (!late_softmax) {
softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
softmax_warp_inplace(wt, n_experts, lane, false);
}
// at this point, each thread holds a portion of softmax,
@ -111,11 +112,11 @@ void main() {
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
uint max_expert = gl_LocalInvocationID.x;
uint max_expert = lane;
[[unroll]]
for (int i = 1; i < experts_per_thread; i++) {
const uint expert = gl_LocalInvocationID.x + i * WARP_SIZE;
const uint expert = lane + i * WARP_SIZE;
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
max_val = wt[i];
max_expert = expert;
@ -132,11 +133,11 @@ void main() {
}
}
if ((k & (WARP_SIZE - 1)) == gl_LocalInvocationID.x) {
if ((k & (WARP_SIZE - 1)) == lane) {
output_weights[k / WARP_SIZE] = max_val;
}
if ((max_expert & (WARP_SIZE - 1)) == gl_LocalInvocationID.x) {
if ((max_expert & (WARP_SIZE - 1)) == lane) {
wt[max_expert / WARP_SIZE] = -INFINITY;
ids[ids_offset + k] = max_expert;
@ -158,12 +159,12 @@ void main() {
}
if (late_softmax) {
softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
softmax_warp_inplace(output_weights, n_expert_used, lane, true);
}
[[unroll]]
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;
uint idx = i * WARP_SIZE + lane;
if (idx < n_expert_used) {
weights[weights_offset + idx] = output_weights[i];
}
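The selection loop above is an iterative arg-max: each of the n_expert_used rounds takes the current maximum across the warp's expert slots, emits its weight and id, then masks it out. A scalar sketch of the same idea (single-threaded, sizes illustrative):

#include <cstdint>
#include <limits>
#include <vector>

// n_expert_used rounds of "take the max, record it, mask it out".
static void topk_moe_ref(std::vector<float> wt, uint32_t n_expert_used,
                         std::vector<float> & weights, std::vector<uint32_t> & ids) {
    for (uint32_t k = 0; k < n_expert_used; ++k) {
        uint32_t max_expert = 0;
        for (uint32_t e = 1; e < wt.size(); ++e) {
            if (wt[e] > wt[max_expert]) {
                max_expert = e;
            }
        }
        weights.push_back(wt[max_expert]);
        ids.push_back(max_expert);
        wt[max_expert] = -std::numeric_limits<float>::infinity();   // exclude from later rounds
    }
}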


@ -38,6 +38,7 @@ shared int counts[SUBGROUP_SIZE];
shared int sh_min_idx;
shared uint sh_total;
shared uint offset_partials[BLOCK_SIZE / SUBGROUP_SIZE];
shared uint eq_min_partials[BLOCK_SIZE / SUBGROUP_SIZE];
// Map float values to uint such that comparisons still work.
// Positive values set the high bit, negative values are inverted.
@ -156,6 +157,11 @@ void topk(const uint row) {
// We need to compact these values to the start of the dst_row array.
// Have each subgroup count how many items it'll store, so other
// subgroups can compute their base offset.
// Values strictly greater than range_min must be stored. For values equal
// to range_min, there can be ties and it's possible we'll need to store
// an arbitrary subset of them.
// If total == p.k, have a fast path where we don't need to handle ties.
if (total == p.k) {
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
uvec4 b = subgroupBallot(top);
uint bit_count = subgroupBallotBitCount(b);
@ -176,6 +182,42 @@ void topk(const uint row) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex] = v;
}
} else {
bool top = f2ui(intBitsToFloat(v.y)) > range_min;
bool eq_min = f2ui(intBitsToFloat(v.y)) == range_min;
uvec4 b_top = subgroupBallot(top);
uvec4 b_eq_min = subgroupBallot(eq_min);
uint bit_count_top = subgroupBallotBitCount(b_top);
uint bit_count_eq_min = subgroupBallotBitCount(b_eq_min);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count_top;
eq_min_partials[tid / SUBGROUP_SIZE] = bit_count_eq_min;
}
barrier();
uint out_idx = 0;
uint eq_min_base = 0;
uint eq_min_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
eq_min_idx += eq_min_partials[i];
}
eq_min_base += offset_partials[i];
}
// range_min values are stored at the end
eq_min_idx += eq_min_base;
uint bit_count_ex_top = subgroupBallotExclusiveBitCount(b_top);
uint bit_count_ex_eq_min = subgroupBallotExclusiveBitCount(b_eq_min);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex_top] = v;
}
if (eq_min && eq_min_idx + bit_count_ex_eq_min < p.k) {
dst_row[eq_min_idx + bit_count_ex_eq_min] = v;
}
}
barrier();
}
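Restating the tie-aware compaction that the two ballots implement (ignoring the float-to-uint ordering trick): values strictly above range_min always survive, and ties at range_min fill whatever slots remain up to k, as an arbitrary subset:

#include <cstdint>
#include <vector>

static std::vector<float> compact_topk(const std::vector<float> & vals, float range_min, uint32_t k) {
    std::vector<float> out;
    for (float v : vals) {
        if (v > range_min) out.push_back(v);      // strictly greater: always stored
    }
    for (float v : vals) {
        if (out.size() >= k) break;               // stop once k entries are written
        if (v == range_min) out.push_back(v);     // ties: stored at the end, arbitrary subset
    }
    return out;
}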

File diff suppressed because it is too large


@ -19,6 +19,15 @@ def parse_decls(decls_text):
return decls
def replace_repl_placeholders(variant, template_map):
for repl, code in variant["REPLS"].items():
for key, val in template_map.items():
# Match "key" and avoid matching subsequences by using \b
code = re.sub(rf'\b{re.escape(str(key))}\b', str(val), code)
variant["REPLS"][repl] = code
return variant
def replace_placeholders(shader_text, replacements):
for key, val in replacements.items():
# Match {{KEY}} literally, where KEY is escaped
@ -71,6 +80,10 @@ def generate_variants(fname, input_dir, output_dir, outfile):
decls_map = parse_decls(extract_block(text, "DECLS"))
except ValueError:
decls_map = {}
try:
templates_map = ast.literal_eval(extract_block(text, "REPL_TEMPLATES"))
except ValueError:
templates_map = {}
for fname in sorted(os.listdir(input_dir)):
if fname.endswith(".tmpl"):
@ -90,9 +103,11 @@ def generate_variants(fname, input_dir, output_dir, outfile):
if key not in decls_map:
raise ValueError(f"DECLS key '{key}' not found.")
decls_code += decls_map[key] + "\n\n"
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_template)
if "REPLS" in variant:
variant = replace_repl_placeholders(variant, templates_map)
final_shader = replace_placeholders(final_shader, variant["REPLS"])
# second run to expand placeholders in repl_template
final_shader = replace_placeholders(final_shader, variant["REPLS"])
final_shader = expand_includes(final_shader, input_dir)


@ -0,0 +1,461 @@
#define(REPL_TEMPLATES)
{
"XIELU_FUNC": "{{MUTATE}}[dst_i] = select(((exp(min(src[src_i], {{TYPE}}(params.eps))) - 1.0) - src[src_i]) * {{TYPE}}(params.alpha_n) + {{TYPE}}(params.beta) * src[src_i], {{TYPE}}(params.alpha_p) * src[src_i] * src[src_i] + {{TYPE}}(params.beta) * src[src_i], src[src_i] > 0.0);",
"ABS_FUNC": "{{MUTATE}}[dst_i] = abs(src[src_i]);",
"SGN_FUNC": "{{MUTATE}}[dst_i] = select({{TYPE}}(select(0.0, -1.0, src[src_i] < 0.0)), {{TYPE}}(1.0), src[src_i] > 0.0);",
"NEG_FUNC": "{{MUTATE}}[dst_i] = -src[src_i];",
"STEP_FUNC": "{{MUTATE}}[dst_i] = {{TYPE}}(select(0.0, 1.0, src[src_i] > 0.0));",
"TANH_FUNC": "{{MUTATE}}[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"RELU_FUNC": "{{MUTATE}}[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);",
"ELU_FUNC": "{{MUTATE}}[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);",
"HARDSIGMOID_FUNC": "{{MUTATE}}[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
"SIGMOID_FUNC": "{{MUTATE}}[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));",
"SILU_FUNC": "{{MUTATE}}[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));",
"EXP_FUNC": "{{MUTATE}}[dst_i] = exp(src[src_i]);",
"HARDSWISH_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
"GELU_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"GELU_QUICK_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458"
}
#end(REPL_TEMPLATES)
#define(VARIANTS)
[
{
"SHADER_NAME": "abs_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "abs_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "abs_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "abs_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sgn_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sgn_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sgn_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sgn_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "neg_f32",
"REPLS": { "TYPE": "f32", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "neg_f16",
"REPLS": { "TYPE": "f16", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "neg_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "neg_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "step_f32",
"REPLS": { "TYPE": "f32", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "step_f16",
"REPLS": { "TYPE": "f16", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "step_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "step_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "tanh_f32",
"REPLS": { "TYPE": "f32", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "tanh_f16",
"REPLS": { "TYPE": "f16", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "tanh_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "tanh_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "elu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "elu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "elu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "elu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "relu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "relu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "relu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "relu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sigmoid_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sigmoid_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sigmoid_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sigmoid_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "silu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "silu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "silu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "silu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "exp_f32",
"REPLS": { "TYPE": "f32", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "exp_f16",
"REPLS": { "TYPE": "f16", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "exp_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "exp_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardswish_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardswish_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardswish_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardswish_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "xielu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "xielu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "xielu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "xielu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(INPLACE)
@group(0) @binding(1)
var<uniform> params: Params;
#enddecl(INPLACE)
#decl(NOT_INPLACE)
@group(0) @binding(1)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(NOT_INPLACE)
#end(DECLS)
#define(SHADER)
enable f16;
fn update(dst_i: u32, src_i: u32) {
{{FUNC}}
}
@group(0) @binding(0)
var<storage, read_write> src: array<{{TYPE}}>;
DECLS
struct Params {
ne: u32, // total number of elements
offset_src: u32, // in elements
offset_dst: u32, // in elements
// Strides (in elements) may be permuted
stride_src0: u32,
stride_src1: u32,
stride_src2: u32,
stride_src3: u32,
stride_dst0: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// Logical shapes
src_ne0: u32,
src_ne1: u32,
src_ne2: u32,
dst_ne0: u32,
dst_ne1: u32,
dst_ne2: u32,
{{EXT_PARAMS}}
};
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0);
i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0);
let i2 = i / (params.src_ne1 * params.src_ne0);
i = i % (params.src_ne1 * params.src_ne0);
let i1 = i / params.src_ne0;
let i0 = i % params.src_ne0;
var j = gid.x;
let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
let j2 = j / (params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne1 * params.dst_ne0);
let j1 = j / params.dst_ne0;
let j0 = j % params.dst_ne0;
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
i2 * params.stride_src2 + i3 * params.stride_src3;
let dst_idx = j0 * params.stride_dst0 + j1 * params.stride_dst1 +
j2 * params.stride_dst2 + j3 * params.stride_dst3;
update(params.offset_dst + dst_idx, params.offset_src + src_idx);
}
#end(SHADER)
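For orientation: each VARIANTS entry above is expanded into one concrete shader by substituting {{TYPE}}, {{FUNC}} and {{EXT_PARAMS}} into the SHADER body and splicing in the DECLS block it names, so the NOT_INPLACE variants bind src, dst and params while the INPLACE variants bind only src (which is mutated in place) and params. The index math in main() turns a flat element index into 4-D coordinates and then into possibly permuted strided offsets. A minimal host-side sketch of that same decomposition, in C++ and with hypothetical names:
#include <cstdint>
// Illustrative sketch: the same flat-index -> strided-offset decomposition the
// shader performs per invocation, written for the CPU. ne0..ne2 are the logical
// sizes of the first three dims; strides are in elements and may be permuted.
struct shape4 {
    uint32_t ne0, ne1, ne2;
    uint32_t stride0, stride1, stride2, stride3;
};
static uint32_t flat_to_offset(uint32_t idx, const shape4 & s) {
    const uint32_t i3 = idx / (s.ne2 * s.ne1 * s.ne0);
    idx               = idx % (s.ne2 * s.ne1 * s.ne0);
    const uint32_t i2 = idx / (s.ne1 * s.ne0);
    idx               = idx % (s.ne1 * s.ne0);
    const uint32_t i1 = idx / s.ne0;
    const uint32_t i0 = idx % s.ne0;
    return i0 * s.stride0 + i1 * s.stride1 + i2 * s.stride2 + i3 * s.stride3;
}
The shader does this twice per element, once with the src shape and strides and once with the dst shape and strides, then passes both offsets (plus the per-tensor element offsets) to update().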

View File

@ -4947,6 +4947,18 @@ struct ggml_tensor * ggml_pad(
return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
}
// ggml_pad_circular
struct ggml_tensor * ggml_pad_circular(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1,
int p2,
int p3) {
return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
}
struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -4973,6 +4985,7 @@ struct ggml_tensor * ggml_pad_ext(
ggml_set_op_params_i32(result, 5, rp2);
ggml_set_op_params_i32(result, 6, lp3);
ggml_set_op_params_i32(result, 7, rp3);
ggml_set_op_params_i32(result, 8, 0); // not circular by default
result->op = GGML_OP_PAD;
@ -4981,6 +4994,25 @@ struct ggml_tensor * ggml_pad_ext(
return result;
}
// ggml_pad_ext_circular
struct ggml_tensor * ggml_pad_ext_circular(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3
) {
struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
ggml_set_op_params_i32(result, 8, 1); // circular
return result;
}
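The two new entry points reuse the existing GGML_OP_PAD op and only flip op param 8 from 0 (constant padding, the default set in ggml_pad_ext above) to 1 (circular, i.e. wrap-around padding). A rough usage sketch, with arbitrary example shapes and assuming ctx is an already initialized ggml_context:
#include "ggml.h"
// Illustrative sketch: build a zero-padded and a circularly padded copy of a
// small f32 tensor. Shapes and the helper name are arbitrary examples.
static void pad_example(struct ggml_context * ctx) {
    struct ggml_tensor * t  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 16, 3, 1);
    struct ggml_tensor * pz = ggml_pad(ctx, t, 4, 3, 0, 0);          // pad with zeros
    struct ggml_tensor * pc = ggml_pad_circular(ctx, t, 4, 3, 0, 0); // pad by wrapping around the edges
    (void) pz; (void) pc; // the results would normally feed into a graph
}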
// ggml_pad_reflect_1d
struct ggml_tensor * ggml_pad_reflect_1d(

View File

@ -376,6 +376,7 @@ class TensorNameMap:
"model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
"model.layers.{bid}.feed_forward.gate", # lfm2moe
"model.layers.{bid}.mlp.router.gate", # afmoe
"layers.{bid}.gate", # mistral-large
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@ -450,6 +451,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
"layers.{bid}.shared_experts.w3", # mistral-large
),
MODEL_TENSOR.FFN_UP_CHEXP: (
@ -496,6 +498,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
"model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
"layers.{bid}.shared_experts.w1", # mistral-large
),
MODEL_TENSOR.FFN_GATE_CHEXP: (
@ -557,6 +560,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
"layers.{bid}.shared_experts.w2", # mistral-large
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
@ -924,14 +928,17 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_A: (
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
"layers.{bid}.attention.wq_a", # mistral-large
),
MODEL_TENSOR.ATTN_Q_B: (
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
"layers.{bid}.attention.wq_b", # mistral-large
),
MODEL_TENSOR.ATTN_KV_A_MQA: (
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
"layers.{bid}.attention.wkv_a_with_mqa", # mistral-large
),
MODEL_TENSOR.ATTN_KV_B: (
@ -940,18 +947,22 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_B: (
"model.layers.{bid}.self_attn.k_b_proj", # deepseek2
"layers.{bid}.attention.k_b_proj", # mistral-large
),
MODEL_TENSOR.ATTN_V_B: (
"model.layers.{bid}.self_attn.v_b_proj", # deepseek2
"layers.{bid}.attention.v_b_proj", # mistral-large
),
MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
"layers.{bid}.attention.q_a_norm", # mistral-large
),
MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
"layers.{bid}.attention.kv_a_norm", # mistral-large
),
MODEL_TENSOR.ATTN_SUB_NORM: (

View File

@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
std::map<int, std::string> mapped;
int blk_id = 0;
int pruned_attention_w = 0;
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
for (const auto & it : ml.weights_map) {
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
if (remapped_name.empty()) {
if (it.first.find("attn_v.weight") != std::string::npos ||
it.first.find("attn_qkv.weight") != std::string::npos ||
it.first.find("attn_kv_b.weight") != std::string::npos) {
pruned_attention_w++;
}
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
continue;
}
@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
bool is_clip_model = false;
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;
@ -717,30 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks for models that have attention layers
if (qs.n_attention_wv != 0 && !is_clip_model)
{
int32_t n_layer_all = model.hparams.n_layer;
if (llama_model_has_encoder(&model)) {
// now n_layer_all is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers
n_layer_all += 2 * model.hparams.dec_n_layer;
}
// note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
}
size_t total_size_org = 0;
size_t total_size_new = 0;

View File

@ -286,10 +286,11 @@ static double nmse(const float * a, const float * b, size_t n) {
return mse_a_b / mse_a_0;
}
// difference between 2 integer sets (Jaccard distance, 0 - no difference, 1 - no overlap)
static double jdst(const int32_t * a, const int32_t * b, size_t n) {
std::unordered_map<int32_t, size_t> set_a;
std::unordered_map<int32_t, size_t> set_b;
// difference between 2 sets (Jaccard distance, 0 - no difference, 1 - no overlap)
template <typename T>
static double jdst(const T * a, const T * b, size_t n) {
std::unordered_map<T, size_t> set_a;
std::unordered_map<T, size_t> set_b;
for (size_t i = 0; i < n; ++i) {
set_a[a[i]]++;
@ -5001,21 +5002,69 @@ struct test_top_k : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const int k;
const bool ties;
ggml_tensor * input {};
std::string vars() override {
return VARS_TO_STR3(type, ne, k);
return VARS_TO_STR4(type, ne, k, ties);
}
test_top_k(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {16, 10, 10, 10},
int k = 4)
: type(type), ne(ne), k(k) {}
int k = 4, bool ties = false)
: type(type), ne(ne), k(k), ties(ties) {}
double max_err() override {
return 0.0;
}
// When there are ties, only validate the final result.
// The logic in err can't handle the sentinel tensors.
bool run_whole_graph() override { return ties; }
double err(const float * a, const float * b, size_t n) override {
// When there are no ties, we expect the exact same set of indices,
// but possibly in a different order. When there are ties, the indices
// can be different but the input values they correspond to should be
// the same. The logic for ties could work for non-ties, but only for
// the output tensor, not for the sentinel tensors.
if (ties) {
std::vector<float> src(ggml_nelements(input));
ggml_backend_tensor_get(input, src.data(), 0, ggml_nelements(input) * ggml_type_size(type));
double diff = 0.0f;
GGML_ASSERT(n == (size_t)(ggml_nrows(input) * k));
int64_t cols = input->ne[0];
std::vector<int32_t> ia(k);
std::vector<int32_t> ib(k);
std::vector<float> asrc(k);
std::vector<float> bsrc(k);
for (int64_t r = 0; r < ggml_nrows(input); r++) {
// Convert indices for the row back to integer
for (int64_t c = 0; c < k; c++) {
ia[c] = (int32_t)a[r * k + c];
ib[c] = (int32_t)b[r * k + c];
}
// The src values for each row should match.
for (int64_t c = 0; c < k; c++) {
asrc[c] = src[r * cols + ia[c]];
bsrc[c] = src[r * cols + ib[c]];
}
diff += jdst(asrc.data(), bsrc.data(), k);
// There should be no duplicate indices
std::sort(ia.begin(), ia.end());
std::sort(ib.begin(), ib.end());
if (std::adjacent_find(ia.begin(), ia.end()) != ia.end()) {
diff += 1;
}
if (std::adjacent_find(ib.begin(), ib.end()) != ib.end()) {
diff += 1;
}
}
return diff;
} else {
std::vector<int32_t> ia(n);
std::vector<int32_t> ib(n);
@ -5032,11 +5081,15 @@ struct test_top_k : public test_case {
return diff + jdst(ia.data(), ib.data(), n);
}
}
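For intuition about the metric: with the standard set-based definition, index sets {1, 2, 3, 4} and {1, 2, 3, 5} share 3 of 5 distinct values, giving a Jaccard distance of 1 - 3/5 = 0.4, while identical sets give 0 and disjoint sets give 1. Since max_err() is 0.0 here, any non-zero distance in the no-ties branch counts as a failure.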
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
// Save 'a' for err()
input = a;
ggml_tensor * out = ggml_top_k(ctx, a, k);
ggml_set_name(out, "out");
@ -5047,12 +5100,17 @@ struct test_top_k : public test_case {
std::random_device rd;
std::default_random_engine rng(rd());
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
// initialize with unique values to avoid ties
int tie_denom = std::max(1, std::min(10, k / 2));
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<float> data(t->ne[0]);
for (int i = 0; i < t->ne[0]; i++) {
if (ties) {
// integer division to introduce duplicates
data[i] = i / tie_denom;
} else {
data[i] = i;
}
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
}
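With the default k = 4, for example, tie_denom is 2 and each row is filled with 0, 0, 1, 1, 2, 2, ... before being shuffled, so values are deliberately duplicated and the top-k selection has to break ties; without the ties flag the row stays the unique sequence 0, 1, 2, ... as before.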
@ -5546,21 +5604,24 @@ struct test_pad : public test_case {
const std::array<int64_t, 4> ne_a;
const int pad_0;
const int pad_1;
const bool circular;
std::string vars() override {
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
return VARS_TO_STR5(type, ne_a, pad_0, pad_1, circular);
}
test_pad(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
int pad_0 = 1, int pad_1 = 1)
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
int pad_0 = 1, int pad_1 = 1, bool circular = false)
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1), circular(circular) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_set_name(a, "a");
ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
ggml_tensor * out = circular
? ggml_pad_circular(ctx, a, pad_0, pad_1, 0, 0)
: ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
ggml_set_name(out, "out");
return out;
@ -5580,17 +5641,19 @@ struct test_pad_ext : public test_case {
const int lp3;
const int rp3;
const bool v;
const bool circular;
std::string vars() override {
return VARS_TO_STR11(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v);
return VARS_TO_STR12(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v, circular);
}
test_pad_ext(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {512, 512, 3, 1},
int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1,
int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1,
bool v = false)
: type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), v(v) {}
bool v = false, bool circular = false)
: type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3),
v(v), circular(circular) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
@ -5601,7 +5664,9 @@ struct test_pad_ext : public test_case {
ggml_set_name(a, "view of a");
}
ggml_tensor * out = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
ggml_tensor * out = circular
? ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
: ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
ggml_set_name(out, "out");
return out;
@ -6146,6 +6211,15 @@ struct test_solve_tri : public test_case {
std::string vars() override { return VARS_TO_STR3(type, ne_lhs, ne_rhs); }
uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
int64_t n = ne_lhs[0];
int64_t k = ne_rhs[0];
int64_t batch = ne_lhs[2] * ne_lhs[3];
// n * (n + 1) / 2 non-zero elements of lhs, 2 flops each, for each col of rhs
return n * (n + 1) * k * batch;
}
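With the default shapes just below (ne_lhs = { 10, 10, 4, 3 }, ne_rhs = { 3, 10, 4, 3 }), that is n = 10, k = 3 and batch = 4 * 3 = 12, so the estimate is 10 * 11 * 3 * 12 = 3960 flops: n * (n + 1) / 2 non-zero coefficients, two flops (a multiply and an add) per coefficient, for each of the k right-hand-side columns in each batch entry.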
test_solve_tri(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_lhs = { 10, 10, 4, 3 },
std::array<int64_t, 4> ne_rhs = { 3, 10, 4, 3 }
@ -6982,6 +7056,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1}));
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
@ -7656,6 +7731,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
if (k <= 1<<i) {
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i), 1, 1, 1}, k));
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i) + 11, 1, 2, 1}, k));
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i) + 11, 1, 2, 1}, k, true));
}
}
}
@ -7713,6 +7789,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1}));
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
test_cases.emplace_back(new test_pad_ext());
test_cases.emplace_back(new test_pad_reflect_1d());
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
@ -7757,10 +7834,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
for (bool v : {false, true}) {
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v));
for (bool circular : {false, true}) {
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v, circular));
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v, circular));
}
}
for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) {
@ -7898,6 +7979,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
{ 58, 3, 64, 32, 8 },
// A deep layer of a ConvNet, several images in the batch
{ 16, 3, 512, 128, 8 },
// High resolution output (large NPQ)
{1536, 3, 64, 32, 1 },
};
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@ -7955,6 +8038,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
// qwen3next with CHUNK_SIZE 64
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
// qwen3next with CHUNK_SIZE 128
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));

Binary file not shown.

View File

@ -3,6 +3,7 @@
import { copyToClipboard, isIMEComposing } from '$lib/utils';
import ChatMessageAssistant from './ChatMessageAssistant.svelte';
import ChatMessageUser from './ChatMessageUser.svelte';
import ChatMessageSystem from './ChatMessageSystem.svelte';
interface Props {
class?: string;
@ -140,8 +141,7 @@
}
function handleSaveEdit() {
if (message.role === 'user') {
// For user messages, trim to avoid accidental whitespace
if (message.role === 'user' || message.role === 'system') {
onEditWithBranching?.(message, editedContent.trim());
} else {
// For assistant messages, preserve exact content including trailing whitespace
@ -167,7 +167,28 @@
}
</script>
{#if message.role === 'user'}
{#if message.role === 'system'}
<ChatMessageSystem
bind:textareaElement
class={className}
{deletionInfo}
{editedContent}
{isEditing}
{message}
onCancelEdit={handleCancelEdit}
onConfirmDelete={handleConfirmDelete}
onCopy={handleCopy}
onDelete={handleDelete}
onEdit={handleEdit}
onEditKeydown={handleEditKeydown}
onEditedContentChange={handleEditedContentChange}
{onNavigateToSibling}
onSaveEdit={handleSaveEdit}
onShowDeleteDialogChange={handleShowDeleteDialogChange}
{showDeleteDialog}
{siblingInfo}
/>
{:else if message.role === 'user'}
<ChatMessageUser
bind:textareaElement
class={className}

View File

@ -0,0 +1,216 @@
<script lang="ts">
import { Check, X } from '@lucide/svelte';
import { Card } from '$lib/components/ui/card';
import { Button } from '$lib/components/ui/button';
import { MarkdownContent } from '$lib/components/app';
import { INPUT_CLASSES } from '$lib/constants/input-classes';
import { config } from '$lib/stores/settings.svelte';
import ChatMessageActions from './ChatMessageActions.svelte';
interface Props {
class?: string;
message: DatabaseMessage;
isEditing: boolean;
editedContent: string;
siblingInfo?: ChatMessageSiblingInfo | null;
showDeleteDialog: boolean;
deletionInfo: {
totalCount: number;
userMessages: number;
assistantMessages: number;
messageTypes: string[];
} | null;
onCancelEdit: () => void;
onSaveEdit: () => void;
onEditKeydown: (event: KeyboardEvent) => void;
onEditedContentChange: (content: string) => void;
onCopy: () => void;
onEdit: () => void;
onDelete: () => void;
onConfirmDelete: () => void;
onNavigateToSibling?: (siblingId: string) => void;
onShowDeleteDialogChange: (show: boolean) => void;
textareaElement?: HTMLTextAreaElement;
}
let {
class: className = '',
message,
isEditing,
editedContent,
siblingInfo = null,
showDeleteDialog,
deletionInfo,
onCancelEdit,
onSaveEdit,
onEditKeydown,
onEditedContentChange,
onCopy,
onEdit,
onDelete,
onConfirmDelete,
onNavigateToSibling,
onShowDeleteDialogChange,
textareaElement = $bindable()
}: Props = $props();
let isMultiline = $state(false);
let messageElement: HTMLElement | undefined = $state();
let isExpanded = $state(false);
let contentHeight = $state(0);
const MAX_HEIGHT = 200; // pixels
const currentConfig = config();
let showExpandButton = $derived(contentHeight > MAX_HEIGHT);
$effect(() => {
if (!messageElement || !message.content.trim()) return;
if (message.content.includes('\n')) {
isMultiline = true;
}
const resizeObserver = new ResizeObserver((entries) => {
for (const entry of entries) {
const element = entry.target as HTMLElement;
const estimatedSingleLineHeight = 24;
isMultiline = element.offsetHeight > estimatedSingleLineHeight * 1.5;
contentHeight = element.scrollHeight;
}
});
resizeObserver.observe(messageElement);
return () => {
resizeObserver.disconnect();
};
});
function toggleExpand() {
isExpanded = !isExpanded;
}
</script>
<div
aria-label="System message with actions"
class="group flex flex-col items-end gap-3 md:gap-2 {className}"
role="group"
>
{#if isEditing}
<div class="w-full max-w-[80%]">
<textarea
bind:this={textareaElement}
bind:value={editedContent}
class="min-h-[60px] w-full resize-none rounded-2xl px-3 py-2 text-sm {INPUT_CLASSES}"
onkeydown={onEditKeydown}
oninput={(e) => onEditedContentChange(e.currentTarget.value)}
placeholder="Edit system message..."
></textarea>
<div class="mt-2 flex justify-end gap-2">
<Button class="h-8 px-3" onclick={onCancelEdit} size="sm" variant="outline">
<X class="mr-1 h-3 w-3" />
Cancel
</Button>
<Button class="h-8 px-3" onclick={onSaveEdit} disabled={!editedContent.trim()} size="sm">
<Check class="mr-1 h-3 w-3" />
Send
</Button>
</div>
</div>
{:else}
{#if message.content.trim()}
<div class="relative max-w-[80%]">
<button
class="group/expand w-full text-left {!isExpanded && showExpandButton
? 'cursor-pointer'
: 'cursor-auto'}"
onclick={showExpandButton && !isExpanded ? toggleExpand : undefined}
type="button"
>
<Card
class="rounded-[1.125rem] !border-2 !border-dashed !border-border/50 bg-muted px-3.75 py-1.5 data-[multiline]:py-2.5"
data-multiline={isMultiline ? '' : undefined}
style="border: 2px dashed hsl(var(--border));"
>
<div
class="relative overflow-hidden transition-all duration-300 {isExpanded
? 'cursor-text select-text'
: 'select-none'}"
style={!isExpanded && showExpandButton
? `max-height: ${MAX_HEIGHT}px;`
: 'max-height: none;'}
>
{#if currentConfig.renderUserContentAsMarkdown}
<div bind:this={messageElement} class="text-md {isExpanded ? 'cursor-text' : ''}">
<MarkdownContent class="markdown-system-content" content={message.content} />
</div>
{:else}
<span
bind:this={messageElement}
class="text-md whitespace-pre-wrap {isExpanded ? 'cursor-text' : ''}"
>
{message.content}
</span>
{/if}
{#if !isExpanded && showExpandButton}
<div
class="pointer-events-none absolute right-0 bottom-0 left-0 h-48 bg-gradient-to-t from-muted to-transparent"
></div>
<div
class="pointer-events-none absolute right-0 bottom-4 left-0 flex justify-center opacity-0 transition-opacity group-hover/expand:opacity-100"
>
<Button
class="rounded-full px-4 py-1.5 text-xs shadow-md"
size="sm"
variant="outline"
>
Show full system message
</Button>
</div>
{/if}
</div>
{#if isExpanded && showExpandButton}
<div class="mb-2 flex justify-center">
<Button
class="rounded-full px-4 py-1.5 text-xs"
onclick={(e) => {
e.stopPropagation();
toggleExpand();
}}
size="sm"
variant="outline"
>
Collapse system message
</Button>
</div>
{/if}
</Card>
</button>
</div>
{/if}
{#if message.timestamp}
<div class="max-w-[80%]">
<ChatMessageActions
actionsPosition="right"
{deletionInfo}
justify="end"
{onConfirmDelete}
{onCopy}
{onDelete}
{onEdit}
{onNavigateToSibling}
{onShowDeleteDialogChange}
{siblingInfo}
{showDeleteDialog}
role="user"
/>
</div>
{/if}
{/if}
</div>

View File

@ -145,7 +145,7 @@
{#if message.content.trim()}
<Card
class="max-w-[80%] rounded-[1.125rem] bg-primary px-3.75 py-1.5 text-primary-foreground data-[multiline]:py-2.5"
class="max-w-[80%] rounded-[1.125rem] border-none bg-primary px-3.75 py-1.5 text-primary-foreground data-[multiline]:py-2.5"
data-multiline={isMultiline ? '' : undefined}
>
{#if currentConfig.renderUserContentAsMarkdown}

View File

@ -2,6 +2,7 @@
import { ChatMessage } from '$lib/components/app';
import { chatStore } from '$lib/stores/chat.svelte';
import { conversationsStore, activeConversation } from '$lib/stores/conversations.svelte';
import { config } from '$lib/stores/settings.svelte';
import { getMessageSiblings } from '$lib/utils';
interface Props {
@ -13,6 +14,7 @@
let { class: className, messages = [], onUserAction }: Props = $props();
let allConversationMessages = $state<DatabaseMessage[]>([]);
const currentConfig = config();
function refreshAllMessages() {
const conversation = activeConversation();
@ -40,7 +42,12 @@
return [];
}
return messages.map((message) => {
// Filter out system messages if showSystemMessage is false
const filteredMessages = currentConfig.showSystemMessage
? messages
: messages.filter((msg) => msg.type !== 'system');
return filteredMessages.map((message) => {
const siblingInfo = getMessageSiblings(allConversationMessages, message.id);
return {

View File

@ -36,12 +36,6 @@
title: 'General',
icon: Settings,
fields: [
{ key: 'apiKey', label: 'API Key', type: 'input' },
{
key: 'systemMessage',
label: 'System Message (will be disabled if left empty)',
type: 'textarea'
},
{
key: 'theme',
label: 'Theme',
@ -52,6 +46,12 @@
{ value: 'dark', label: 'Dark', icon: Moon }
]
},
{ key: 'apiKey', label: 'API Key', type: 'input' },
{
key: 'systemMessage',
label: 'System Message',
type: 'textarea'
},
{
key: 'pasteLongTextToFileLen',
label: 'Paste long text to file length',

View File

@ -95,7 +95,7 @@
</div>
{#if field.help || SETTING_CONFIG_INFO[field.key]}
<p class="mt-1 text-xs text-muted-foreground">
{field.help || SETTING_CONFIG_INFO[field.key]}
{@html field.help || SETTING_CONFIG_INFO[field.key]}
</p>
{/if}
{:else if field.type === 'textarea'}
@ -112,13 +112,28 @@
value={String(localConfig[field.key] ?? '')}
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
class="min-h-[100px] w-full md:max-w-2xl"
class="min-h-[10rem] w-full md:max-w-2xl"
/>
{#if field.help || SETTING_CONFIG_INFO[field.key]}
<p class="mt-1 text-xs text-muted-foreground">
{field.help || SETTING_CONFIG_INFO[field.key]}
</p>
{/if}
{#if field.key === 'systemMessage'}
<div class="mt-3 flex items-center gap-2">
<Checkbox
id="showSystemMessage"
checked={Boolean(localConfig.showSystemMessage ?? true)}
onCheckedChange={(checked) => onConfigChange('showSystemMessage', Boolean(checked))}
/>
<Label for="showSystemMessage" class="cursor-pointer text-sm font-normal">
Show system message in conversations
</Label>
</div>
{/if}
{:else if field.type === 'select'}
{@const selectedOption = field.options?.find(
(opt: { value: string; label: string; icon?: Component }) =>

View File

@ -8,6 +8,7 @@
import * as AlertDialog from '$lib/components/ui/alert-dialog';
import Input from '$lib/components/ui/input/input.svelte';
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
import { chatStore } from '$lib/stores/chat.svelte';
import ChatSidebarActions from './ChatSidebarActions.svelte';
const sidebar = Sidebar.useSidebar();
@ -98,6 +99,10 @@
await goto(`#/chat/${id}`);
}
function handleStopGeneration(id: string) {
chatStore.stopGenerationForChat(id);
}
</script>
<ScrollArea class="h-[100vh]">
@ -132,6 +137,7 @@
onSelect={selectConversation}
onEdit={handleEditConversation}
onDelete={handleDeleteConversation}
onStop={handleStopGeneration}
/>
</Sidebar.MenuItem>
{/each}

View File

@ -1,6 +1,7 @@
<script lang="ts">
import { Trash2, Pencil, MoreHorizontal, Download, Loader2 } from '@lucide/svelte';
import { Trash2, Pencil, MoreHorizontal, Download, Loader2, Square } from '@lucide/svelte';
import { ActionDropdown } from '$lib/components/app';
import * as Tooltip from '$lib/components/ui/tooltip';
import { getAllLoadingChats } from '$lib/stores/chat.svelte';
import { conversationsStore } from '$lib/stores/conversations.svelte';
import { onMount } from 'svelte';
@ -12,6 +13,7 @@
onDelete?: (id: string) => void;
onEdit?: (id: string) => void;
onSelect?: (id: string) => void;
onStop?: (id: string) => void;
}
let {
@ -20,6 +22,7 @@
onDelete,
onEdit,
onSelect,
onStop,
isActive = false
}: Props = $props();
@ -38,8 +41,14 @@
onDelete?.(conversation.id);
}
function handleStop(event: Event) {
event.stopPropagation();
onStop?.(conversation.id);
}
function handleGlobalEditEvent(event: Event) {
const customEvent = event as CustomEvent<{ conversationId: string }>;
if (customEvent.detail.conversationId === conversation.id && isActive) {
handleEdit(event);
}
@ -88,8 +97,28 @@
>
<div class="flex min-w-0 flex-1 items-center gap-2">
{#if isLoading}
<Loader2 class="h-3.5 w-3.5 shrink-0 animate-spin text-muted-foreground" />
<Tooltip.Root>
<Tooltip.Trigger>
<div
class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
onclick={handleStop}
onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
role="button"
tabindex="0"
aria-label="Stop generation"
>
<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />
<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
</div>
</Tooltip.Trigger>
<Tooltip.Content>
<p>Stop generation</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
<!-- svelte-ignore a11y_click_events_have_key_events -->
<!-- svelte-ignore a11y_no_static_element_interactions -->
<span class="truncate text-sm font-medium" onclick={handleMobileSidebarItemClick}>
@ -147,5 +176,25 @@
opacity: 1 !important;
}
}
.stop-button {
:global(.stop-icon) {
display: none;
}
:global(.loading-icon) {
display: block;
}
}
&:is(:hover) .stop-button {
:global(.stop-icon) {
display: block;
}
:global(.loading-icon) {
display: none;
}
}
}
</style>

View File

@ -19,8 +19,10 @@ export { default as ChatMessage } from './chat/ChatMessages/ChatMessage.svelte';
export { default as ChatMessageActions } from './chat/ChatMessages/ChatMessageActions.svelte';
export { default as ChatMessageBranchingControls } from './chat/ChatMessages/ChatMessageBranchingControls.svelte';
export { default as ChatMessageStatistics } from './chat/ChatMessages/ChatMessageStatistics.svelte';
export { default as ChatMessageSystem } from './chat/ChatMessages/ChatMessageSystem.svelte';
export { default as ChatMessageThinkingBlock } from './chat/ChatMessages/ChatMessageThinkingBlock.svelte';
export { default as ChatMessages } from './chat/ChatMessages/ChatMessages.svelte';
export { default as MessageBranchingControls } from './chat/ChatMessages/ChatMessageBranchingControls.svelte';
export { default as ChatScreen } from './chat/ChatScreen/ChatScreen.svelte';
export { default as ChatScreenHeader } from './chat/ChatScreen/ChatScreenHeader.svelte';

View File

@ -337,19 +337,23 @@
line-height: 1.75;
}
div :global(:is(h1, h2, h3, h4, h5, h6):first-child) {
margin-top: 0;
}
/* Headers with consistent spacing */
div :global(h1) {
font-size: 1.875rem;
font-weight: 700;
margin: 1.5rem 0 0.75rem 0;
line-height: 1.2;
margin: 1.5rem 0 0.75rem 0;
}
div :global(h2) {
font-size: 1.5rem;
font-weight: 600;
margin: 1.25rem 0 0.5rem 0;
line-height: 1.3;
margin: 1.25rem 0 0.5rem 0;
}
div :global(h3) {

View File

@ -3,6 +3,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
apiKey: '',
systemMessage: '',
showSystemMessage: true,
theme: 'system',
showThoughtInProgress: false,
showToolCalls: false,
@ -43,8 +44,9 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
};
export const SETTING_CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
apiKey: 'Set the API Key if you are using the <code>--api-key</code> option for the server.',
systemMessage: 'The starting message that defines how the model should behave.',
showSystemMessage: 'Display the system message at the top of each conversation.',
theme:
'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.',
pasteLongTextToFileLen:

View File

@ -90,7 +90,6 @@ export class ChatService {
custom,
timings_per_token,
// Config options
systemMessage,
disableReasoningFormat
} = options;
@ -104,6 +103,7 @@ export class ChatService {
}
})
.filter((msg) => {
// Filter out empty system messages
if (msg.role === 'system') {
const content = typeof msg.content === 'string' ? msg.content : '';
@ -113,10 +113,8 @@ export class ChatService {
return true;
});
const processedMessages = ChatService.injectSystemMessage(normalizedMessages, systemMessage);
const requestBody: ApiChatCompletionRequest = {
messages: processedMessages.map((msg: ApiChatMessageData) => ({
messages: normalizedMessages.map((msg: ApiChatMessageData) => ({
role: msg.role,
content: msg.content
})),
@ -680,46 +678,6 @@ export class ChatService {
// Utilities
// ─────────────────────────────────────────────────────────────────────────────
/**
* Injects a system message at the beginning of the conversation if provided.
* Checks for existing system messages to avoid duplication.
*
* @param messages - Array of chat messages to process
* @param systemMessage - Optional system message to inject
* @returns Array of messages with system message injected at the beginning if provided
* @private
*/
private static injectSystemMessage(
messages: ApiChatMessageData[],
systemMessage?: string
): ApiChatMessageData[] {
const trimmedSystemMessage = systemMessage?.trim();
if (!trimmedSystemMessage) {
return messages;
}
if (messages.length > 0 && messages[0].role === 'system') {
if (messages[0].content !== trimmedSystemMessage) {
const updatedMessages = [...messages];
updatedMessages[0] = {
role: 'system',
content: trimmedSystemMessage
};
return updatedMessages;
}
return messages;
}
const systemMsg: ApiChatMessageData = {
role: 'system',
content: trimmedSystemMessage
};
return [systemMsg, ...messages];
}
/**
* Parses error response and creates appropriate error with context information
* @param response - HTTP response object

View File

@ -166,6 +166,49 @@ export class DatabaseService {
return rootMessage.id;
}
/**
* Creates a system prompt message for a conversation.
*
* @param convId - Conversation ID
* @param systemPrompt - The system prompt content (must be non-empty)
* @param parentId - Parent message ID (typically the root message)
* @returns The created system message
* @throws Error if systemPrompt is empty
*/
static async createSystemMessage(
convId: string,
systemPrompt: string,
parentId: string
): Promise<DatabaseMessage> {
const trimmedPrompt = systemPrompt.trim();
if (!trimmedPrompt) {
throw new Error('Cannot create system message with empty content');
}
const systemMessage: DatabaseMessage = {
id: uuid(),
convId,
type: 'system',
timestamp: Date.now(),
role: 'system',
content: trimmedPrompt,
parent: parentId,
thinking: '',
children: []
};
await db.messages.add(systemMessage);
const parentMessage = await db.messages.get(parentId);
if (parentMessage) {
await db.messages.update(parentId, {
children: [...parentMessage.children, systemMessage.id]
});
}
return systemMessage;
}
/**
* Deletes a conversation and all its messages.
*

View File

@ -2,7 +2,11 @@ import { DatabaseService, ChatService } from '$lib/services';
import { conversationsStore } from '$lib/stores/conversations.svelte';
import { config } from '$lib/stores/settings.svelte';
import { contextSize, isRouterMode } from '$lib/stores/server.svelte';
import { selectedModelName, modelsStore } from '$lib/stores/models.svelte';
import {
selectedModelName,
modelsStore,
selectedModelContextSize
} from '$lib/stores/models.svelte';
import {
normalizeModelName,
filterByLeafNodeId,
@ -261,6 +265,13 @@ class ChatStore {
return activeState.contextTotal;
}
if (isRouterMode()) {
const modelContextSize = selectedModelContextSize();
if (modelContextSize && modelContextSize > 0) {
return modelContextSize;
}
}
const propsContextSize = contextSize();
if (propsContextSize && propsContextSize > 0) {
return propsContextSize;
@ -458,6 +469,14 @@ class ChatStore {
onError?: (error: Error) => void,
modelOverride?: string | null
): Promise<void> {
// Ensure model props are cached before streaming (for correct n_ctx in processing info)
if (isRouterMode()) {
const modelName = modelOverride || selectedModelName();
if (modelName && !modelsStore.getModelProps(modelName)) {
await modelsStore.fetchModelProps(modelName);
}
}
let streamedContent = '';
let streamedReasoningContent = '';
let streamedToolCallContent = '';
@ -624,6 +643,22 @@ class ChatStore {
this.clearChatStreaming(currentConv.id);
try {
if (isNewConversation) {
const rootId = await DatabaseService.createRootMessage(currentConv.id);
const currentConfig = config();
const systemPrompt = currentConfig.systemMessage?.toString().trim();
if (systemPrompt) {
const systemMessage = await DatabaseService.createSystemMessage(
currentConv.id,
systemPrompt,
rootId
);
conversationsStore.addMessageToActive(systemMessage);
}
}
const userMessage = await this.addMessage('user', content, 'text', '-1', extras);
if (!userMessage) throw new Error('Failed to add user message');
if (isNewConversation && content)
@ -666,13 +701,17 @@ class ChatStore {
if (!activeConv) return;
await this.savePartialResponseIfNeeded(activeConv.id);
await this.stopGenerationForChat(activeConv.id);
}
async stopGenerationForChat(convId: string): Promise<void> {
await this.savePartialResponseIfNeeded(convId);
this.stopStreaming();
this.abortRequest(activeConv.id);
this.setChatLoading(activeConv.id, false);
this.clearChatStreaming(activeConv.id);
this.clearProcessingState(activeConv.id);
this.abortRequest(convId);
this.setChatLoading(convId, false);
this.clearChatStreaming(convId);
this.clearProcessingState(convId);
}
/**
@ -999,14 +1038,20 @@ class ChatStore {
const activeConv = conversationsStore.activeConversation;
if (!activeConv || this.isLoading) return;
const result = this.getMessageByIdWithRole(messageId, 'user');
let result = this.getMessageByIdWithRole(messageId, 'user');
if (!result) {
result = this.getMessageByIdWithRole(messageId, 'system');
}
if (!result) return;
const { message: msg } = result;
try {
const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
const isFirstUserMessage = rootMessage && msg.parent === rootMessage.id;
const isFirstUserMessage =
msg.role === 'user' && rootMessage && msg.parent === rootMessage.id;
const parentId = msg.parent || rootMessage?.id;
if (!parentId) return;
@ -1037,7 +1082,10 @@ class ChatStore {
);
}
await conversationsStore.refreshActiveMessages();
if (msg.role === 'user') {
await this.generateResponseForMessage(newMessage.id);
}
} catch (error) {
console.error('Failed to edit message with branching:', error);
}

View File

@ -158,6 +158,22 @@ class ModelsStore {
return this.modelPropsCache.get(modelId) ?? null;
}
/**
* Get context size (n_ctx) for a specific model from cached props
*/
getModelContextSize(modelId: string): number | null {
const props = this.modelPropsCache.get(modelId);
return props?.default_generation_settings?.n_ctx ?? null;
}
/**
* Get context size for the currently selected model or null if no model is selected
*/
get selectedModelContextSize(): number | null {
if (!this.selectedModelName) return null;
return this.getModelContextSize(this.selectedModelName);
}
/**
* Check if props are being fetched for a model
*/
@ -579,3 +595,4 @@ export const loadedModelIds = () => modelsStore.loadedModelIds;
export const loadingModelIds = () => modelsStore.loadingModelIds;
export const propsCacheVersion = () => modelsStore.propsCacheVersion;
export const singleModelName = () => modelsStore.singleModelName;
export const selectedModelContextSize = () => modelsStore.selectedModelContextSize;

View File

@ -1,4 +1,4 @@
export type ChatMessageType = 'root' | 'text' | 'think';
export type ChatMessageType = 'root' | 'text' | 'think' | 'system';
export type ChatRole = 'user' | 'assistant' | 'system';
export interface ChatUploadedFile {