From 9a96389544a08fd829fccda28142ce2066017fde Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Thu, 7 Aug 2025 13:45:41 +0200 Subject: [PATCH 01/12] ggml: Skip backend library linking code when GGML_BACKEND_DL=ON (#15094) Any available libraries are found and loaded dynamically at runtime. --- ggml/cmake/ggml-config.cmake.in | 82 +++++++++++++++++---------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 65a75a2f1c..91c9d5cd34 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml) find_library(GGML_LIBRARY ggml REQUIRED - HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR} + HINTS ${GGML_LIB_DIR} NO_CMAKE_FIND_ROOT_PATH) add_library(ggml::ggml UNKNOWN IMPORTED) @@ -125,54 +125,56 @@ if(NOT TARGET ggml::ggml) IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") set(_ggml_all_targets "") - foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) - string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") - string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) + if (NOT GGML_BACKEND_DL) + foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) + string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}") + string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx) - find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} - REQUIRED - HINTS ${GGML_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) + find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend} + REQUIRED + HINTS ${GGML_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH) - message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") + message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}") - add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" - IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" - INTERFACE_COMPILE_FEATURES c_std_90 - POSITION_INDEPENDENT_CODE ON) - - string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") - if(is_cpu_variant) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") - - if(GGML_CPU_INTERFACE_LINK_OPTIONS) - set_target_properties(ggml::${_ggml_backend} - PROPERTIES - INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") - endif() - - else() - list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") + add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED) set_target_properties(ggml::${_ggml_backend} PROPERTIES - INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") + INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}" + INTERFACE_COMPILE_FEATURES c_std_90 + POSITION_INDEPENDENT_CODE ON) - if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) + string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}") + if(is_cpu_variant) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}") + + if(GGML_CPU_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}") + endif() + + else() + list(APPEND 
${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base") set_target_properties(ggml::${_ggml_backend} PROPERTIES - INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") - endif() - endif() + INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}") - list(APPEND _ggml_all_targets ggml::${_ggml_backend}) - endforeach() + if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS) + set_target_properties(ggml::${_ggml_backend} + PROPERTIES + INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}") + endif() + endif() + + list(APPEND _ggml_all_targets ggml::${_ggml_backend}) + endforeach() + endif() list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}") set_target_properties(ggml::ggml From 7ad67ba9fe2b909e271dd31b99c5fce3aba35899 Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 7 Aug 2025 16:44:14 +0200 Subject: [PATCH 02/12] HIP: add cmake option to enable compiler output of kernel resource usage metrics (#15103) --- ggml/CMakeLists.txt | 1 + ggml/src/ggml-hip/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 7c4666b7ed..1fb7abeaf0 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -176,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF) option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON) +option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF) option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF) option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index e92ec7faa3..852de97346 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -121,6 +121,10 @@ if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7 add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12) endif() +if (GGML_HIP_EXPORT_METRICS) + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps") +endif() + if (NOT GGML_CUDA_FA) add_compile_definitions(GGML_CUDA_NO_FA) endif() From 99acbc9921b119aa7ed929eb5780a66a8f06e6d9 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 8 Aug 2025 00:20:40 +0800 Subject: [PATCH 03/12] llama : Support intern-s1 (#14875) * support internvl * support interns1 * resolve comments * put interns1 in tensor mapping * resolve comment * move tokenizer changes to sub class --- convert_hf_to_gguf.py | 109 ++++++++++++++++++++++++++++++++- gguf-py/gguf/tensor_mapping.py | 15 +++++ 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 82b005e84a..8e8f09642c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3328,7 +3328,13 @@ class Qwen25OmniModel(Qwen2VLVisionModel): @ModelBase.register("InternVisionModel") class InternVisionModel(MmprojModel): def set_gguf_parameters(self): + assert self.hparams_vision is not None + if isinstance(self.hparams_vision['image_size'], list): + self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0] + if isinstance(self.hparams_vision['patch_size'], list): + self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0] 
super().set_gguf_parameters() + hparams = self.hparams self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) @@ -3352,14 +3358,30 @@ class InternVisionModel(MmprojModel): return gguf.GGMLQuantizationType.F32 return False + def _mapping_interns1_name(self, name): + names_map = { + "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias", + "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight", + "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias", + "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight", + "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias", + "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight", + } + if name in names_map: + name = names_map[name] + return name + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("vision_model") or name.startswith("mlp"): + vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] + # deal with intern-s1 special case + name = self._mapping_interns1_name(name) + if any([name.startswith(prefix) for prefix in vision_prefix]): # process visual tensors # correct name if name.startswith("vision_model"): name = "vision_tower." + name - if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"): + if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"): name += ".weight" # split QKV tensors if needed if ".qkv." in name: @@ -3445,6 +3467,10 @@ class Qwen2MoeModel(TextModel): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately + name = name.replace("language_model.", "") # InternVL + if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"): + # skip visual tensors + return [] if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -3498,6 +3524,85 @@ class Qwen3Model(Qwen2Model): class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams = ModelBase.load_hparams(self.dir_model) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + # deal with intern-s1 + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in 
llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_tokens_map_file = self.dir_model / 'special_tokens_map.json' + additional_special_tokens = [] + if special_tokens_map_file.is_file(): + with open(special_tokens_map_file, encoding = 'utf-8') as f: + additional_special_tokens = json.load(f).get('additional_special_tokens', []) + tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json' + if tokenizer_cfg_file.is_file(): + with open(tokenizer_cfg_file, encoding = 'utf-8') as f: + added_tokens_decoder = json.load(f).get('added_tokens_decoder', {}) + token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']} + for token in additional_special_tokens: + if token in token2ids_map: + special_vocab._set_special_token(token, token2ids_map[token]) + special_vocab._set_special_token('eos', 151645) + special_vocab._set_special_token("bos", 151643) + special_vocab.add_to_gguf(self.gguf_writer) + @ModelBase.register("GPT2LMHeadModel") class GPT2Model(TextModel): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a0f11563ac..dc7c03b464 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1110,11 +1110,13 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_CLS: ( "vision_tower.vision_model.embeddings.class_embedding", + "model.vision_tower.embeddings.cls_token", # Intern-S1 "vision_model.class_embedding", # llama 4 ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( "vision_tower.vision_model.embeddings.patch_embedding", + "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM "vision_tower.patch_conv", # pixtral @@ -1124,6 +1126,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", + "model.vision_tower.embeddings.position_embeddings", # Intern-S1 "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM "vision_model.positional_embedding_vlm", # llama 4 @@ -1131,6 +1134,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_Q: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 @@ -1140,10 +1144,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL + 
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1 ), MODEL_TENSOR.V_ENC_ATTN_K: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 @@ -1153,10 +1159,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL + "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1 ), MODEL_TENSOR.V_ENC_ATTN_V: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 @@ -1167,6 +1175,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_INPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL + "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral @@ -1177,6 +1186,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_O: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL + "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 @@ -1187,6 +1197,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL + "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 @@ -1196,6 +1207,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral @@ -1211,6 +1223,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral @@ -1221,10 +1234,12 @@ class TensorNameMap: MODEL_TENSOR.V_LAYER_SCALE_1: ( "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL + "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1 ), MODEL_TENSOR.V_LAYER_SCALE_2: ( "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL + "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1 ), MODEL_TENSOR.V_PRE_NORM: ( From 
a0552c8beef74e843bb085c8ef0c63f9ed7a2b27 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 7 Aug 2025 15:07:11 -0500 Subject: [PATCH 04/12] vulkan: Add env var to disable host visible vidmem (#15109) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 165933a729..f1cb90e3b3 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -534,6 +534,7 @@ struct vk_device_struct { ggml_backend_buffer_type buffer_type; bool disable_fusion; + bool disable_host_visible_vidmem; #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; @@ -1804,6 +1805,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { } else if (device->uma) { // Fall back to host memory type buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + } else if (device->disable_host_visible_vidmem) { + buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eDeviceLocal); } else { // use rebar if available, otherwise fallback to device only visible memory buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -3265,6 +3268,9 @@ static vk_device ggml_vk_get_device(size_t idx) { const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; + const char* GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM = getenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); + device->disable_host_visible_vidmem = GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM != nullptr; + bool fp16_storage = false; bool fp16_compute = false; bool maintenance4_support = false; From c4f53563df4575196ea13f5ed669ea8ea659a6be Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 7 Aug 2025 15:44:20 -0500 Subject: [PATCH 05/12] vulkan: support fattn sinks (#15126) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 58 +++++++++++++------ .../vulkan-shaders/flash_attn.comp | 21 +++++++ .../vulkan-shaders/flash_attn_base.comp | 13 ++++- .../vulkan-shaders/flash_attn_cm1.comp | 21 +++++++ .../vulkan-shaders/flash_attn_cm2.comp | 28 +++++++++ .../flash_attn_split_k_reduce.comp | 27 ++++++++- 6 files changed, 148 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f1cb90e3b3..b1cbbc9866 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2286,14 +2286,14 @@ static void ggml_vk_load_shaders(vk_device& device) { }; #define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, HSK, HSV, HEAD_SIZES) \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64, 64, 64) \ @@ -2910,7 +2910,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 4 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { @@ -6507,11 +6507,14 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co return supported; } -static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, 
bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + if (sinks) { + std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3]; + } std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); GGML_TENSOR_LOCALS(int64_t, neq, q, ne) @@ -6710,10 +6713,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr; - size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0; + vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr, d_S = nullptr; + size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0, s_buf_offset = 0; - bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; + bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false, S_uma = false; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); @@ -6728,6 +6731,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset); M_uma = d_M != nullptr; } + if (sinks) { + ggml_vk_host_get(ctx->device, sinks->data, d_S, s_buf_offset); + S_uma = d_S != nullptr; + } } @@ -6763,7 +6770,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } - uint32_t mask_n_head_log2 = ((mask != nullptr) << 16) | n_head_log2; + if (!S_uma) { + d_S = d_Q; + s_buf_offset = q_buf_offset; + if (sinks) { + ggml_backend_vk_buffer_context * s_buf_ctx = (ggml_backend_vk_buffer_context*)sinks->buffer->context; + d_S = s_buf_ctx->dev_buffer; + s_buf_offset = vk_tensor_offset(sinks) + sinks->view_offs; + } + } + + uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2; const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, @@ -6787,6 +6804,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, 
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, }, // We only use split_k when group query attention is enabled, which means @@ -6796,10 +6814,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); - const std::array pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k }; + const std::array pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, { vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, + vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 }); @@ -6810,6 +6829,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, pc, { workgroups_x, workgroups_y, workgroups_z }); @@ -9874,7 +9894,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_FLASH_ATTN_EXT: - ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun); + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node, dryrun); break; @@ -10951,8 +10971,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (head_sizes == FA_HEAD_SIZE_UNSUPPORTED) { return false; } - // TODO: support attention sinks [TAG_ATTN_SINKS] - if (op->src[4]) { + if (op->src[4] && op->src[4]->type != GGML_TYPE_F32) { return false; } if (op->src[0]->type != GGML_TYPE_F32) { @@ -11547,6 +11566,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); + } } else if (tensor->op == GGML_OP_MUL_MAT) { tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_MUL_MAT_ID) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 45c6e7736a..d40848e15f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -305,6 +305,27 @@ void main() { return; } + if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) { + [[unroll]] for (uint32_t r = 0; r < Br; ++r) { + float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2); + + float ms = 1.0f; + float vs = 1.0f; + + if (sink > Mf[r]) { + ms = exp(Mf[r] - sink); + + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { + Of[r][d] *= ms; + } + } else { + vs = exp(sink - Mf[r]); + } + + Lf[r] = Lf[r]*ms + vs; + } + } + float Lfrcp[Br]; [[unroll]] for (uint32_t r = 0; r < Br; ++r) { Lfrcp[r] = 1.0 / Lf[r]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp 
index 7defe72b40..b57c9dcfc4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -50,10 +50,13 @@ layout (push_constant) uniform parameter { uint32_t k_num; } p; +#define SINK_ENABLE_BIT (1<<24) #define MASK_ENABLE_BIT (1<<16) #define N_LOG2_MASK 0xFFFF -layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; +layout (binding = 4) readonly buffer S {float data_s[];}; + +layout (binding = 5) writeonly buffer O {D_TYPE data_o[];}; #if defined(A_TYPE_PACKED16) #define BINDING_IDX_K 0 @@ -111,6 +114,14 @@ ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const i return ACC_TYPE(pow(base, ACC_TYPE(exph))); } +// Load the sink value, indexed by Q's dimension 2. +ACC_TYPE perElemOpGetSink(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) +{ + const uint32_t h = iq2 + (r % p.gqa_ratio); + + return ACC_TYPE(data_s[h]); +} + uint32_t i, N, KV, split_k_index, Tr, start_j, end_j, iq2, iq3, rk2, rk3, rv2, rv3, ik2, ik3, iv2, iv3, q_stride, k_stride, v_stride, m_stride; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 486735fe8b..230e815f22 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -329,6 +329,27 @@ void main() { return; } + if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) { + [[unroll]] for (uint32_t r = 0; r < Br; ++r) { + float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2); + + float ms = 1.0f; + float vs = 1.0f; + + if (sink > Mf[r]) { + ms = exp(Mf[r] - sink); + + [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { + Of[r][d] *= ACC_TYPE(ms); + } + } else { + vs = exp(sink - Mf[r]); + } + + Lf[r] = Lf[r]*ms + vs; + } + } + float Lfrcp[rows_per_thread]; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { Lfrcp[r] = 1.0 / Lf[r]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 274f48fcab..b0564ca0bf 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -248,6 +248,34 @@ void main() { // resize L by using smear/reduce coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); + if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) { + coopmat S; + coopMatPerElementNV(S, S, perElemOpGetSink, iq2); + + coopmat Mr; + + // resize M by using smear/reduce + coopMatReduceNV(Mr, M, gl_CooperativeMatrixReduceRowNV, smearReduce); + + // O, Ldiag, Mr all have the same type so all element locations match + [[unroll]] for (uint32_t i = 0; i < Ldiag.length(); ++i) { + ACC_TYPE sink = S[i]; + + ACC_TYPE ms = ACC_TYPE(1.0f); + ACC_TYPE vs = ACC_TYPE(1.0f); + + if (sink > Mr[i]) { + ms = exp(Mr[i] - sink); + + O[i] *= ms; + } else { + vs = exp(sink - Mr[i]); + } + + Ldiag[i] = Ldiag[i]*ms + vs; + } + } + [[unroll]] for (int k = 0; k < Ldiag.length(); ++k) { Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp index 0a17a9df23..76ef4b6dfb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp @@ -7,13 +7,15 @@ layout(constant_id = 0) const uint BLOCK_SIZE = 32; 
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {float data_a[];}; -layout (binding = 1) writeonly buffer D {float data_d[];}; +layout (binding = 1) readonly buffer B {float data_s[];}; +layout (binding = 2) writeonly buffer D {float data_d[];}; layout (push_constant) uniform parameter { uint D; uint N; uint ne3; uint k_num; + uint sinks; } p; shared float tmpsh[BLOCK_SIZE]; @@ -73,6 +75,22 @@ void main() { } L = tmpsh[0]; + float sink; + if (p.sinks != 0) { + sink = data_s[n]; + + float ms = 1.0f; + float vs = 1.0f; + + if (sink > m_max) { + ms = exp(m_max - sink); + } else { + vs = exp(sink - m_max); + } + + L = L*ms + vs; + } + L = 1.0 / L; // D dimension is split across workgroups in the y dimension @@ -85,6 +103,13 @@ void main() { float m = data_a[m_offset + k * lm_stride]; O += exp(m - m_max) * data_a[o_offset]; } + if (p.sinks != 0) { + if (sink > m_max) { + float ms = 1.0f; + ms = exp(m_max - sink); + O *= ms; + } + } O *= L; data_d[iq3 * D * N + D * n + d] = O; } From 50aa9389014bba2dd12234132aa6b8ca3601a17f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 7 Aug 2025 23:26:03 +0200 Subject: [PATCH 06/12] convert : support non-mxfp4 HF model (#15153) * convert : support non-mxfp4 HF model * rm redundant check * disable debug check --- convert_hf_to_gguf.py | 21 ++++++++++++++++----- src/llama-quant.cpp | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8e8f09642c..b8c7d97a78 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8102,7 +8102,6 @@ class GptOssModel(TextModel): def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: blocks0: Tensor = torch.zeros(1) blocks1: Tensor = torch.zeros(1) - found_mxfp4_tensors = False # we assume that tensors are loaded in the correct order for name, data_torch in self.get_tensors(): if "mlp.experts.down_proj_blocks" in name: @@ -8110,7 +8109,6 @@ class GptOssModel(TextModel): elif "mlp.experts.down_proj_scales" in name: new_name = self.map_tensor_name(name.replace("_scales", ".weight")) self.repack_mxfp4(new_name, blocks0, data_torch) - found_mxfp4_tensors = True elif "mlp.experts.gate_up_proj_blocks" in name: blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] elif "mlp.experts.gate_up_proj_scales" in name: @@ -8119,9 +8117,6 @@ class GptOssModel(TextModel): new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) self.repack_mxfp4(new_name_gate, blocks0, scales0) self.repack_mxfp4(new_name_up, blocks1, scales1) - found_mxfp4_tensors = True - if not found_mxfp4_tensors: - raise ValueError("No MXFP4 tensors found in the model. 
Please make sure you are using MXFP4 model.") return [] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -8134,7 +8129,12 @@ class GptOssModel(TextModel): if "down_proj" in name: if name.endswith("_bias"): name = name.replace("down_proj_bias", "down_proj.bias") + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name = name.replace("down_proj", "down_proj.weight") + data_torch = data_torch.transpose(-1, -2) else: + # otherwise, it should already be repacked to ggml MXFP4 format return [] # split the gate_up into gate and up @@ -8147,7 +8147,18 @@ class GptOssModel(TextModel): (self.map_tensor_name(name_gate), gate_proj_bias), (self.map_tensor_name(name_up), up_proj_bias) ] + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + data_torch = data_torch.transpose(-1, -2) + gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] + return [ + (self.map_tensor_name(name_gate), gate_proj_weight), + (self.map_tensor_name(name_up), up_proj_weight) + ] else: + # otherwise, it should already be repacked to ggml MXFP4 format return [] return [(self.map_tensor_name(name), data_torch)] diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9c0ffde27a..1d0361cc16 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 1 +#if 0 if (new_type == GGML_TYPE_MXFP4) { auto * x = f32_data_03; From aaa3d07ae749b781d6135eaff23c7fa8a4ab404a Mon Sep 17 00:00:00 2001 From: lhez Date: Fri, 8 Aug 2025 13:47:03 +0900 Subject: [PATCH 07/12] opencl: support sink in `soft_max` (attn sinks) (#15152) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 49 +++++++++++-------- ggml/src/ggml-opencl/kernels/softmax_4_f16.cl | 12 ++++- ggml/src/ggml-opencl/kernels/softmax_4_f32.cl | 12 ++++- ggml/src/ggml-opencl/kernels/softmax_f16.cl | 12 ++++- ggml/src/ggml-opencl/kernels/softmax_f32.cl | 12 ++++- 5 files changed, 68 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 4f765ab530..b32d5da307 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2520,8 +2520,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SOFT_MAX: - // TODO: support attention sinks [TAG_ATTN_SINKS] - return op->src[2] == nullptr; case GGML_OP_NORM: case GGML_OP_RMS_NORM: return true; @@ -6594,17 +6592,24 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ASSERT(src1->extra); } + const ggml_tensor * src2 = dst->src[2]; + if (src2) { + GGML_ASSERT(src2->extra); + } + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; ggml_tensor_extra_cl * extra1 = src1 ? 
(ggml_tensor_extra_cl *)src1->extra : nullptr; + ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr; cl_ulong offset0 = extra0->offset + src0->view_offs; cl_ulong offsetd = extrad->offset + dst->view_offs; cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; + cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0; const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; @@ -6672,25 +6677,27 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? 
&extra2->data_device : &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &max_bias)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(float), &m0)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &m1)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_head_log2)); size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; diff --git a/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl b/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl index a6d8ede670..571d16507c 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl @@ -26,6 +26,8 @@ kernel void kernel_soft_max_4_f16( ulong offset0, global char * src1, ulong offset1, + global char * src2, + ulong offset2, global char * dst, ulong offsetd, int ne00, @@ -48,6 +50,7 @@ kernel void kernel_soft_max_4_f16( ) { src0 = src0 + offset0; src1 = src1 + offset1; + src2 = src2 + offset2; dst = dst + offsetd; int i03 = get_group_id(2); @@ -60,6 +63,7 @@ kernel void kernel_soft_max_4_f16( global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0; global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; @@ -75,7 +79,7 @@ kernel void kernel_soft_max_4_f16( } // parallel max - float4 lmax4 = -INFINITY; + float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY; for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(pmask ? 
convert_float4(pmask[i00]) : 0.0f)); } @@ -92,7 +96,11 @@ kernel void kernel_soft_max_4_f16( } float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3; - const float sum = sub_group_reduce_add(lsum); + float sum = sub_group_reduce_add(lsum); + + if (psrc2) { + sum += exp(psrc2[i02] - max); + } for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { pdst4[i00] /= sum; diff --git a/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl b/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl index 35b5573b46..1f944b2201 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl @@ -26,6 +26,8 @@ kernel void kernel_soft_max_4( ulong offset0, global char * src1, ulong offset1, + global char * src2, + ulong offset2, global char * dst, ulong offsetd, int ne00, @@ -48,6 +50,7 @@ kernel void kernel_soft_max_4( ) { src0 = src0 + offset0; src1 = src1 + offset1; + src2 = src2 + offset2; dst = dst + offsetd; int i03 = get_group_id(2); @@ -60,6 +63,7 @@ kernel void kernel_soft_max_4( global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0; global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; @@ -75,7 +79,7 @@ kernel void kernel_soft_max_4( } // parallel max - float4 lmax4 = -INFINITY; + float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY; for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)); } @@ -92,7 +96,11 @@ kernel void kernel_soft_max_4( } float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3; - const float sum = sub_group_reduce_add(lsum); + float sum = sub_group_reduce_add(lsum); + + if (psrc2) { + sum += exp(psrc2[i02] - max); + } for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) { pdst4[i00] /= sum; diff --git a/ggml/src/ggml-opencl/kernels/softmax_f16.cl b/ggml/src/ggml-opencl/kernels/softmax_f16.cl index 9d292b5746..4baa6c28e4 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_f16.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_f16.cl @@ -26,6 +26,8 @@ kernel void kernel_soft_max_f16( ulong offset0, global char * src1, ulong offset1, + global char * src2, + ulong offset2, global char * dst, ulong offsetd, int ne00, @@ -48,6 +50,7 @@ kernel void kernel_soft_max_f16( ) { src0 = src0 + offset0; src1 = src1 + offset1; + src2 = src2 + offset2; dst = dst + offsetd; int i03 = get_group_id(2); @@ -60,6 +63,7 @@ kernel void kernel_soft_max_f16( global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); global half * pmask = src1 != src0 ? (global half *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0; global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; @@ -75,7 +79,7 @@ kernel void kernel_soft_max_f16( } // parallel max - float lmax = -INFINITY; + float lmax = psrc2 ? psrc2[i02] : -INFINITY; for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? 
slope*pmask[i00] : 0.0f)); } @@ -91,7 +95,11 @@ kernel void kernel_soft_max_f16( pdst[i00] = exp_psrc0; } - const float sum = sub_group_reduce_add(lsum); + float sum = sub_group_reduce_add(lsum); + + if (psrc2) { + sum += exp(psrc2[i02] - max); + } for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { pdst[i00] /= sum; diff --git a/ggml/src/ggml-opencl/kernels/softmax_f32.cl b/ggml/src/ggml-opencl/kernels/softmax_f32.cl index 7c53dfbe5a..d503190b47 100644 --- a/ggml/src/ggml-opencl/kernels/softmax_f32.cl +++ b/ggml/src/ggml-opencl/kernels/softmax_f32.cl @@ -26,6 +26,8 @@ kernel void kernel_soft_max( ulong offset0, global char * src1, ulong offset1, + global char * src2, + ulong offset2, global char * dst, ulong offsetd, int ne00, @@ -48,6 +50,7 @@ kernel void kernel_soft_max( ) { src0 = src0 + offset0; src1 = src1 + offset1; + src2 = src2 + offset2; dst = dst + offsetd; int i03 = get_group_id(2); @@ -60,6 +63,7 @@ kernel void kernel_soft_max( global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03); global float * pmask = src1 != src0 ? (global float *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0; + global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0; global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3); float slope = 1.0f; @@ -75,7 +79,7 @@ kernel void kernel_soft_max( } // parallel max - float lmax = -INFINITY; + float lmax = psrc2 ? psrc2[i02] : -INFINITY; for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)); } @@ -91,7 +95,11 @@ kernel void kernel_soft_max( pdst[i00] = exp_psrc0; } - const float sum = sub_group_reduce_add(lsum); + float sum = sub_group_reduce_add(lsum); + + if (psrc2) { + sum += exp(psrc2[i02] - max); + } for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { pdst[i00] /= sum; From 1425f587a82bc303469b5c32759a2746ba4e1e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 8 Aug 2025 08:19:58 +0200 Subject: [PATCH 08/12] CUDA: attention sinks for mma FlashAttention (#15157) --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 87 ++++++++++++++++++++++------ ggml/src/ggml-cuda/fattn.cu | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 3 +- 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 3712538441..39731baaeb 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -785,6 +785,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const half2 * const __restrict__ K_h2, const half2 * const __restrict__ V_h2, const half2 * const __restrict__ mask_h2, + const float * const __restrict__ sinks_f, float2 * const __restrict__ dstk, float2 * const __restrict__ dstk_fixup, const float scale, @@ -957,6 +958,52 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } + // If attention sinks are used, potentially re-scale if KQ_max is small. + // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum + // so it's being done unconditionally for every thread. + if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) { + float KQ_max_scale[cols_per_thread]; +#pragma unroll + for (int col = 0; col < cols_per_thread; ++col) { + static_assert(ntiles == 1 || ntiles == 2, "ntiles > 2 not implemented"); + const int jc = ntiles == 1 ? 
2*tile_C_VKQ::get_j(col/2) + col % 2 : tile_C_VKQ_16::get_i(col); + const float sink = sinks_f[jc % ncols2]; + + const float KQ_max_new = fmaxf(KQ_max[col], sink); + const float KQ_max_diff = KQ_max[col] - KQ_max_new; + KQ_max_scale[col] = expf(KQ_max_diff); + KQ_max[col] = KQ_max_new; + + *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD; + + const float KQ_max_add = expf(sink - KQ_max_new); + KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_max_add; + } + + if (ntiles == 1) { + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]); +#pragma unroll + for (int i = 0; i < DV/tile_C_VKQ::I; ++i) { +#pragma unroll + for (int l = 0; l < tile_C_VKQ::ne; ++l) { + VKQ_C[i].x[l] *= KQ_max_scale_h2; + } + } + } else { +#pragma unroll + for (int col = 0; col < cols_per_thread; ++col) { + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]); +#pragma unroll + for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) { +#pragma unroll + for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) { + VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2; + } + } + } + } + } + // Combine VKQ accumulator values if np > 1. // It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM. // So also write VKQ accumulators to shared memory in column-major format if np == 1. @@ -1271,18 +1318,21 @@ static __global__ void flash_attn_ext_f16( while (kbc < kbc_stop && kb0_stop == iter_k) { const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); - const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); - const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. + const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2 + const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. - const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2)); - const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio)); + const int head0 = zt * ncols2; + + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : (const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1); - float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2); + float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); + const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; const int kb0_start_kernel = kb0_start * kb_niter; int kb0_stop_kernel = kb0_stop * kb_niter; @@ -1295,12 +1345,12 @@ static __global__ void flash_attn_ext_f16( if (kb0_start == 0) { constexpr bool needs_fixup = false; // CUDA block is working on an entire tile. 
flash_attn_ext_f16_process_tile - (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, + (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } else { constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile. flash_attn_ext_f16_process_tile - (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, + (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } @@ -1316,18 +1366,21 @@ static __global__ void flash_attn_ext_f16( } const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); - const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); - const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. + const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2 + const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. - const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2)); - const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio)); + const int head0 = zt * ncols2; + + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr : (const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1); - float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2); + float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); + const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; const int kb0_start_kernel = kb0_start * kb_niter; int kb0_stop_kernel = kb0_stop * kb_niter; @@ -1339,7 +1392,7 @@ static __global__ void flash_attn_ext_f16( constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. 
constexpr bool needs_fixup = false; flash_attn_ext_f16_process_tile - (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, + (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks); diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 8ddd0415b7..6c1185deac 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -282,7 +282,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); // TODO: currently only vec implementation for sinks is supported [TAG_ATTN_SINKS] - if (sinks) { + if (sinks && !fp16_mma_available(cc)) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ec7ab25518..19e9c405ea 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3532,7 +3532,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0; } // TODO: more general-purpose attention sink support [TAG_ATTN_SINKS] - if (op->src[4] && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) { // currently only sinks for head_size 64 and 128 are supported + if (op->src[4] && !fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) + && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) { return false; } if (op->src[0]->ne[0] == 192) { From 6c7e9a54406dbba5e53754a8f70a285414717b06 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 8 Aug 2025 10:45:18 +0100 Subject: [PATCH 09/12] vendor: sync minja (#15161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * vendor: sync minja * Update minja.hpp * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- vendor/minja/chat-template.hpp | 15 ++++++++--- vendor/minja/minja.hpp | 49 +++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/vendor/minja/chat-template.hpp b/vendor/minja/chat-template.hpp index cf113bf222..d5295b335b 100644 --- a/vendor/minja/chat-template.hpp +++ b/vendor/minja/chat-template.hpp @@ -162,8 +162,15 @@ class chat_template { }), false); caps_.supports_tools = contains(out, "some_tool"); - auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false); - auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false); + const auto render_with_content = [&](const json & content) { + const json assistant_msg {{"role", "assistant"}, {"content", content}}; + // Render two assistant messages as some templates like QwQ-32B are handling + // the content differently depending on whether it's the last message or not + // (to remove the tag in all but the last message). 
+ return try_raw_render(json::array({dummy_user_msg, assistant_msg, dummy_user_msg, assistant_msg}), {}, false); + }; + auto out_empty = render_with_content(""); + auto out_null = render_with_content(json()); caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle); json j_null; @@ -191,12 +198,12 @@ class chat_template { dummy_user_msg, make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})), }), {}, false); - auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); + auto tool_call_renders_str_arguments = contains(out, "") || contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); out = try_raw_render(json::array({ dummy_user_msg, make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})), }), {}, false); - auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); + auto tool_call_renders_obj_arguments = contains(out, "") || contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':"); caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments; caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments; diff --git a/vendor/minja/minja.hpp b/vendor/minja/minja.hpp index dd107dccda..dad75efbba 100644 --- a/vendor/minja/minja.hpp +++ b/vendor/minja/minja.hpp @@ -1291,6 +1291,12 @@ public: } }; +static bool in(const Value & value, const Value & container) { + return (((container.is_array() || container.is_object()) && container.contains(value)) || + (value.is_string() && container.is_string() && + container.to_str().find(value.to_str()) != std::string::npos)); +} + class BinaryOpExpr : public Expression { public: enum class Op { StrConcat, Add, Sub, Mul, MulMul, Div, DivDiv, Mod, Eq, Ne, Lt, Gt, Le, Ge, And, Or, In, NotIn, Is, IsNot }; @@ -1355,13 +1361,8 @@ public: case Op::Gt: return l > r; case Op::Le: return l <= r; case Op::Ge: return l >= r; - case Op::In: return (((r.is_array() || r.is_object()) && r.contains(l)) || - (l.is_string() && r.is_string() && - r.to_str().find(l.to_str()) != std::string::npos)); - case Op::NotIn: - return !(((r.is_array() || r.is_object()) && r.contains(l)) || - (l.is_string() && r.is_string() && - r.to_str().find(l.to_str()) != std::string::npos)); + case Op::In: return in(l, r); + case Op::NotIn: return !in(l, r); default: break; } throw std::runtime_error("Unknown binary operator"); @@ -1500,6 +1501,13 @@ public: } else if (method->get_name() == "pop") { vargs.expectArgs("pop method", {1, 1}, {0, 0}); return obj.pop(vargs.args[0]); + } else if (method->get_name() == "keys") { + vargs.expectArgs("keys method", {0, 0}, {0, 0}); + auto result = Value::array(); + for (const auto& key : obj.keys()) { + result.push_back(Value(key)); + } + return result; } else if (method->get_name() == "get") { vargs.expectArgs("get method", {1, 2}, {0, 0}); auto key = vargs.args[0]; @@ -1541,6 +1549,16 @@ public: } else if (method->get_name() == "capitalize") { vargs.expectArgs("capitalize method", {0, 0}, {0, 0}); return Value(capitalize(str)); + } else if (method->get_name() == "upper") { + vargs.expectArgs("upper method", {0, 0}, {0, 0}); + auto result = str; + std::transform(result.begin(), result.end(), result.begin(), ::toupper); + return Value(result); + } else if (method->get_name() == "lower") { + vargs.expectArgs("lower method", {0, 0}, {0, 
0}); + auto result = str; + std::transform(result.begin(), result.end(), result.begin(), ::tolower); + return Value(result); } else if (method->get_name() == "endswith") { vargs.expectArgs("endswith method", {1, 1}, {0, 0}); auto suffix = vargs.args[0].get(); @@ -2646,15 +2664,11 @@ inline std::shared_ptr Context::builtins() { auto items = Value::array(); if (args.contains("object")) { auto & obj = args.at("object"); - if (obj.is_string()) { - auto json_obj = json::parse(obj.get()); - for (const auto & kv : json_obj.items()) { - items.push_back(Value::array({kv.key(), kv.value()})); - } - } else if (!obj.is_null()) { - for (auto & key : obj.keys()) { - items.push_back(Value::array({key, obj.at(key)})); - } + if (!obj.is_object()) { + throw std::runtime_error("Can only get item pairs from a mapping"); + } + for (auto & key : obj.keys()) { + items.push_back(Value::array({key, obj.at(key)})); } } return items; @@ -2782,6 +2796,9 @@ inline std::shared_ptr Context::builtins() { if (!items.is_array()) throw std::runtime_error("object is not iterable"); return items; })); + globals.set("in", simple_function("in", { "item", "items" }, [](const std::shared_ptr &, Value & args) -> Value { + return in(args.at("item"), args.at("items")); + })); globals.set("unique", simple_function("unique", { "items" }, [](const std::shared_ptr &, Value & args) -> Value { auto & items = args.at("items"); if (!items.is_array()) throw std::runtime_error("object is not iterable"); From cd6983d56d2cce94ecb86bb114ae8379a609073c Mon Sep 17 00:00:00 2001 From: AN Long Date: Fri, 8 Aug 2025 21:37:22 +0900 Subject: [PATCH 10/12] ggml : fix field name when new ggml_backend (#14944) --- ggml/src/ggml-blas/ggml-blas.cpp | 8 ++++---- ggml/src/ggml-cpu/ggml-cpu.cpp | 8 ++++---- ggml/src/ggml-cuda/ggml-cuda.cu | 8 ++++---- ggml/src/ggml-opencl/ggml-opencl.cpp | 8 ++++---- ggml/src/ggml-rpc/ggml-rpc.cpp | 8 ++++---- ggml/src/ggml-sycl/ggml-sycl.cpp | 8 ++++---- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++++---- 7 files changed, 28 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index ec158dfac6..aeac2e5744 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .interface = */ blas_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_blas_guid(), + /* .iface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), + /* .context = */ ctx, }; #if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index e16cdc9d49..8dacd36714 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -214,10 +214,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback_data = NULL; ggml_backend_t cpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_cpu_guid(), - /* .interface = */ ggml_backend_cpu_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cpu_guid(), + /* .iface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, }; if (cpu_backend == NULL) { diff 
--git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 19e9c405ea..d9110491ec 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3799,10 +3799,10 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } ggml_backend_t cuda_backend = new ggml_backend { - /* .guid = */ ggml_backend_cuda_guid(), - /* .interface = */ ggml_backend_cuda_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cuda_guid(), + /* .iface = */ ggml_backend_cuda_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), + /* .context = */ ctx, }; return cuda_backend; diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index b32d5da307..8ba1e00df7 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2624,10 +2624,10 @@ ggml_backend_t ggml_backend_opencl_init(void) { ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_opencl_guid(), - /* .interface = */ ggml_backend_opencl_i, - /* .device = */ dev, - /* .context = */ backend_ctx + /* .guid = */ ggml_backend_opencl_guid(), + /* .iface = */ ggml_backend_opencl_i, + /* .device = */ dev, + /* .context = */ backend_ctx }; return backend; diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 29bc421d58..df6ba54076 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -823,10 +823,10 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { }; ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_rpc_guid(), - /* .interface = */ ggml_backend_rpc_interface, - /* .device = */ ggml_backend_rpc_add_device(endpoint), - /* .context = */ ctx + /* .guid = */ ggml_backend_rpc_guid(), + /* .iface = */ ggml_backend_rpc_interface, + /* .device = */ ggml_backend_rpc_add_device(endpoint), + /* .context = */ ctx }; return backend; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 6fa27418ce..3992dad01d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4586,10 +4586,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) { }; ggml_backend_t sycl_backend = new ggml_backend { - /* .guid = */ ggml_backend_sycl_guid(), - /* .interface = */ ggml_backend_sycl_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), - /* .context = */ ctx + /* .guid = */ ggml_backend_sycl_guid(), + /* .iface = */ ggml_backend_sycl_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), + /* .context = */ ctx }; return sycl_backend; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b1cbbc9866..4070e248ba 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -10767,10 +10767,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { ggml_vk_init(ctx, dev_num); ggml_backend_t vk_backend = new ggml_backend { - /* .guid = */ ggml_backend_vk_guid(), - /* .interface = */ ggml_backend_vk_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), - /* .context = */ ctx, + /* .guid = */ ggml_backend_vk_guid(), + /* .iface = */ ggml_backend_vk_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), + /* .context = */ ctx, }; 
return vk_backend; From 4850b52aedceeb70bb4fe49f2d7cd1df6ee98682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 8 Aug 2025 23:04:36 +0200 Subject: [PATCH 11/12] server-bench: external OAI servers, sqlite (#15179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * server-bench: external OAI servers, sqlite * Update scripts/server-bench.py Co-authored-by: Sigbjørn Skjæret * Update scripts/server-bench.py Co-authored-by: Sigbjørn Skjæret * Update scripts/server-bench.py Co-authored-by: Sigbjørn Skjæret * raise_for_status --------- Co-authored-by: Sigbjørn Skjæret --- scripts/server-bench.py | 70 ++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/scripts/server-bench.py b/scripts/server-bench.py index 9326be8d57..a716020173 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -4,6 +4,7 @@ import argparse import json import os import random +import sqlite3 import subprocess from time import sleep, time from typing import Optional, Union @@ -47,6 +48,8 @@ def get_prompts_rng(prompt_lengths: list[int]) -> list[list[int]]: def get_server(path_server: str, path_log: Optional[str]) -> dict: + if path_server.startswith("http://") or path_server.startswith("https://"): + return {"process": None, "address": path_server, "fout": None} if os.environ.get("LLAMA_ARG_HOST") is None: logger.info("LLAMA_ARG_HOST not explicitly set, using 127.0.0.1") os.environ["LLAMA_ARG_HOST"] = "127.0.0.1" @@ -89,15 +92,13 @@ def get_prompt_length(data: dict) -> int: f"{server_address}/apply-template", json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]} ) - if response.status_code != 200: - raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}") + response.raise_for_status() prompt: str = json.loads(response.text)["prompt"] response = session.post( f"{server_address}/tokenize", json={"content": prompt, "add_special": True} ) - if response.status_code != 200: - raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}") + response.raise_for_status() tokens: list[str] = json.loads(response.text)["tokens"] return len(tokens) @@ -107,7 +108,12 @@ def send_prompt(data: dict) -> tuple[float, list[float]]: server_address: str = data["server_address"] t_submit = time() - if data["synthetic_prompt"]: + if data["external_server"]: + json_data: dict = { + "prompt": data["prompt"], "ignore_eos": True, + "seed": data["seed"], "max_tokens": data["n_predict"], "stream": True} + response = session.post(f"{server_address}/v1/completions", json=json_data, stream=True) + elif data["synthetic_prompt"]: json_data: dict = { "prompt": data["prompt"], "ignore_eos": True, "cache_prompt": False, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True} @@ -117,34 +123,38 @@ def send_prompt(data: dict) -> tuple[float, list[float]]: f"{server_address}/apply-template", json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]} ) - if response.status_code != 200: - raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}") + response.raise_for_status() prompt: str = json.loads(response.text)["prompt"] json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True} response = session.post(f"{server_address}/completion", json=json_data, stream=True) + response.raise_for_status() + lines = [] 
token_arrival_times: list[float] = [] for line in response.iter_lines(decode_unicode=False): if not line.startswith(b"data: "): continue + lines.append(line) token_arrival_times.append(time()) token_arrival_times = token_arrival_times[:-1] - - if response.status_code != 200: - raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}") + if len(lines) > 1 and "timings" in json.loads(lines[-2][6:]): + token_arrival_times = token_arrival_times[:-1] return (t_submit, token_arrival_times) -def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_prompts: int, n_predict: int, n_predict_min: int, seed_offset: int): +def benchmark( + path_server: str, path_log: Optional[str], path_db: Optional[str], name: Optional[str], prompt_source: str, n_prompts: int, + n_predict: int, n_predict_min: int, seed_offset: int): + external_server: bool = path_server.startswith("http://") or path_server.startswith("https://") if os.environ.get("LLAMA_ARG_N_PARALLEL") is None: logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32") os.environ["LLAMA_ARG_N_PARALLEL"] = "32" - if os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None: + if not external_server and os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None: logger.info("LLAMA_ARG_N_GPU_LAYERS not explicitly set, using 999") os.environ["LLAMA_ARG_N_GPU_LAYERS"] = "999" - if os.environ.get("LLAMA_ARG_FLASH_ATTN") is None: + if not external_server and os.environ.get("LLAMA_ARG_FLASH_ATTN") is None: logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'") os.environ["LLAMA_ARG_FLASH_ATTN"] = "true" @@ -165,7 +175,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p else: n_predict_min = n_predict - if os.environ.get("LLAMA_ARG_CTX_SIZE") is None: + if not external_server and os.environ.get("LLAMA_ARG_CTX_SIZE") is None: context_per_slot: int = int(1.05 * (n_predict + (np.max(prompt_n) if synthetic_prompts else 2048))) context_total: int = context_per_slot * parallel os.environ["LLAMA_ARG_CTX_SIZE"] = str(context_total) @@ -176,6 +186,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p try: server = get_server(path_server, path_log) server_address: str = server["address"] + assert external_server == (server["process"] is None) adapter = requests.adapters.HTTPAdapter(pool_connections=parallel, pool_maxsize=parallel) # type: ignore session = requests.Session() @@ -188,8 +199,9 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p if seed_offset >= 0: random.seed(3 * (seed_offset + 1000 * i) + 1) data.append({ - "session": session, "server_address": server_address, "prompt": p, "synthetic_prompt": synthetic_prompts, - "n_predict": random.randint(n_predict_min, n_predict), "seed": (3 * (seed_offset + 1000 * i) + 2) if seed_offset >= 0 else -1}) + "session": session, "server_address": server_address, "external_server": external_server, "prompt": p, + "synthetic_prompt": synthetic_prompts, "n_predict": random.randint(n_predict_min, n_predict), + "seed": (3 * (seed_offset + 1000 * i) + 2) if seed_offset >= 0 else -1}) if not synthetic_prompts: logger.info("Getting the prompt lengths...") @@ -199,7 +211,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p t0 = time() results: list[tuple[float, list[float]]] = thread_map(send_prompt, data, max_workers=parallel, chunksize=1) finally: - if server is not None: + if server is not None and server["process"] is not None: 
server["process"].terminate() server["process"].wait() if session is not None: @@ -233,15 +245,24 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p logger.info(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens") logger.info(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s") logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot") - logger.info("") - logger.info( - "The above numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, " - "particularly when the server is fast vs. the network or Python script (e.g. when serving a very small model).") + + if path_db is not None: + con = sqlite3.connect(path_db) + cursor = con.cursor() + cursor.execute( + "CREATE TABLE IF NOT EXISTS server_bench" + "(name TEXT, n_parallel INTEGER, prompt_source TEXT, n_prompts INTEGER, " + "n_predict INTEGER, n_predict_min INTEGER, seed_offset INTEGER, runtime REAL);") + cursor.execute( + "INSERT INTO server_bench VALUES (?, ?, ?, ?, ?, ?, ?, ?);", + [name, parallel, prompt_source, n_prompts, n_predict, n_predict_min, seed_offset, token_t_last]) + con.commit() plt.figure() plt.scatter(prompt_n, 1e3 * prompt_t, s=10.0, marker=".", alpha=0.25) plt.xlim(0, 1.05e0 * np.max(prompt_n)) plt.ylim(0, 1.05e3 * np.max(prompt_t)) + plt.title(name or "") plt.xlabel("Prompt length [tokens]") plt.ylabel("Time to first token [ms]") plt.savefig("prompt_time.png", dpi=240) @@ -250,6 +271,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p plt.figure() plt.hist(token_t, np.arange(0, bin_max)) plt.xlim(0, bin_max + 1) + plt.title(name or "") plt.xlabel("Time [s]") plt.ylabel("Num. tokens generated per second") plt.savefig("gen_rate.png", dpi=240) @@ -259,9 +281,13 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description="Tool for benchmarking the throughput of the llama.cpp HTTP server. " "Results are printed to console and visualized as plots (saved to current working directory). " - "To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help).") + "To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help). " + "The reported numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, " + "particularly when the server is fast vs. the network or Python script (e.g. 
when serving a very small model).") parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary") parser.add_argument("--path_log", type=str, default="server-bench-{port}.log", help="Path to the model to use for the benchmark") + parser.add_argument("--path_db", type=str, default=None, help="Path to an sqlite database to store the benchmark results in") + parser.add_argument("--name", type=str, default=None, help="Name to label plots and database entries with") parser.add_argument( "--prompt_source", type=str, default="rng-1024-2048", help="How to get the prompts for the benchmark, either 'mmlu' for MMLU questions or " From e54d41befcc1575f4c898c5ff4ef43970cead75f Mon Sep 17 00:00:00 2001 From: compilade Date: Fri, 8 Aug 2025 17:48:26 -0400 Subject: [PATCH 12/12] gguf-py : add Numpy MXFP4 de/quantization support (#15111) * gguf-py : add MXFP4 de/quantization support * ggml-quants : handle zero amax for MXFP4 --- ggml/src/ggml-quants.c | 2 +- gguf-py/gguf/quants.py | 57 +++++++++++++++++++++++++++++++++--- gguf-py/tests/test_quants.py | 17 ++++++++--- 3 files changed, 67 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index a57d2a16d6..94f6405ca1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -288,7 +288,7 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE } } - const uint8_t e = (uint8_t) (floorf(log2f(amax)) - 2 + 127); + const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0; const float d = GGML_E8M0_TO_FP32_HALF(e); diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 3c8ba82e19..31845ea6ee 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -228,8 +228,7 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): d = max / -8 with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) - # FIXME: Q4_0's reference rounding is cursed and depends on FMA - qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15) + qs = np.trunc((blocks * id) + np.float32(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15) qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -300,8 +299,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): d = max / -16 with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) - # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31) + q = np.trunc((blocks * id) + np.float32(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31) qs = q.reshape((n_blocks, 2, cls.block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -655,6 +653,57 @@ class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0): return (d * qs.astype(np.float32)) +class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4): + # e2m1 values (doubled) + # ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12) + + @staticmethod + # see ggml_e8m0_to_fp32_half in ggml-impl.h + def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray: + bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23)) + return bits.view(np.float32) + + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> 
np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + + with np.errstate(divide="ignore"): + e = np.where(d > 0, np.floor(np.log2(d)) - 2 + 127, 0).astype(np.uint8) + + d = cls.e8m0_to_fp32_half(e) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape((1, 1, 16)) + + errs = np.abs(d.reshape((n_blocks, 1, 1)) * kvalues.astype(np.float32) - blocks.reshape((n_blocks, cls.block_size, 1))) + best = np.argmin(errs, axis=-1, keepdims=True) + + qs = best.reshape(n_blocks, 2, cls.block_size // 2).astype(np.uint8) + qs = qs[:, 0] | (qs[:, 1] << np.uint8(4)) + + qs = qs.reshape((n_blocks, cls.block_size // 2)) + + return np.concatenate([e, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + e, qs = np.hsplit(blocks, [1]) + + d = cls.e8m0_to_fp32_half(e) + + qs = qs.reshape((n_blocks, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1)) + qs = (qs & np.uint8(0x0F)).view(np.int8) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) + qs = np.take_along_axis(kvalues, qs, axis=-1).reshape((n_blocks, cls.block_size)) + + return (d * qs.astype(np.float32)) + + class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): ksigns: bytes = ( b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py index f04d5acce2..172fa0018a 100755 --- a/gguf-py/tests/test_quants.py +++ b/gguf-py/tests/test_quants.py @@ -67,6 +67,7 @@ class GGMLQuants: "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", "tq1_0", "tq2_0", + "mxfp4", "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m", "iq4_nl", "iq4_xs", ): @@ -140,14 +141,21 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) return False -def do_test(libggml_path: Path, quick: bool = False): +def do_test(libggml_path: Path, quick: bool = False, user_type: GGMLQuantizationType | None = None): ggml_quants = GGMLQuants(libggml_path) np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n}) r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False) + # test zero blocks + r[0, 0, :] = 0 + ## Maybe test infinities? 
(can make NANs, not really useful in practice) + # r[0, 1, 0] = np.inf + # r[0, 2, 0] = -np.inf + # r[0, 3, 0] = np.inf + # r[0, 3, 1] = -np.inf - for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()): + for qtype in ((GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()) if user_type is None else (user_type,)): has_dequantize = False has_quantize = False @@ -228,11 +236,12 @@ def do_test(libggml_path: Path, quick: bool = False): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation") - parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so") + parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "bin" / "libggml.so", help="The path to libggml.so") parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary") + parser.add_argument("--type", type=str, help="The quant type to test (all by default)") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG) - do_test(args.libggml, args.quick) + do_test(args.libggml, args.quick, GGMLQuantizationType[args.type.upper()] if args.type is not None else None)
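
For reference, a minimal round-trip sketch of the new Numpy MXFP4 path, assuming the existing module-level quantize()/dequantize() helpers in gguf.quants (which dispatch on the registered qtype) and the block layout shown in the diff above (one E8M0 scale byte plus 16 packed nibbles per 32-value block); the shapes and seed here are illustrative only, not part of the patch.

import numpy as np

from gguf.constants import GGMLQuantizationType
from gguf.quants import dequantize, quantize

rng = np.random.default_rng(seed=0)
data = rng.standard_normal((4, 256)).astype(np.float32)
data[0, :32] = 0.0  # an all-zero block, matching the zero-amax case covered by the test above

# Pack to MXFP4: each 32-value block becomes 17 bytes (1 E8M0 scale + 16 bytes of 4-bit codes),
# so the last dimension shrinks from 256 floats to 256 // 32 * 17 = 136 bytes.
packed = quantize(data, GGMLQuantizationType.MXFP4)
print(packed.dtype, packed.shape)

# Unpack and compare; only fp4 rounding error remains, and the zero block is restored exactly.
restored = dequantize(packed, GGMLQuantizationType.MXFP4)
print(restored.shape, float(np.max(np.abs(restored - data))))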