Merge branch 'ggml-org:master' into master

2025-08-04 10:57:36 -07:00 · 2025-08-04 10:57:36 -07:00 · 4c5874282b
parent ea39068e39 2721257e3e
commit 4c5874282b
55 changed files with 5582 additions and 493 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@ -0,0 +1,130 @@
 # ==============================================================================
 # ARGUMENTS
 # ==============================================================================
 # Define the CANN base image for easier version updates later
 ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
 # ==============================================================================
 # BUILD STAGE
 # Compile all binary files and libraries
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS build
 # Define the Ascend chip model for compilation. Default is Ascend910B3
 ARG ASCEND_SOC_TYPE=Ascend910B3
 # -- Install build dependencies --
 RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum
 # -- Set the working directory --
 WORKDIR /app
 # -- Copy project files --
 COPY . .
 # -- Set CANN environment variables (required for compilation) --
 # ENV (unlike `source` inside a single RUN) keeps these variables set for all later instructions in this stage and in the resulting image
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
 ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
 ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 # ... You can add other environment variables from the original file as needed ...
 # For brevity, only core variables are listed here. You can paste the original ENV list here.
 # -- Build llama.cpp --
 # Use the passed ASCEND_SOC_TYPE argument and add general build options
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)
 # -- Organize build artifacts for copying in later stages --
 # Create a lib directory to store all .so files
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 # Create a full directory to store all executables and Python scripts.
 # tools.sh is copied too: the `full` target uses /app/tools.sh as its ENTRYPOINT,
 # so leaving it out produces an image that cannot start.
 RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/ && \
    cp .devops/tools.sh /app/full/tools.sh
 # ==============================================================================
 # BASE STAGE
 # Create a minimal base image with CANN runtime and common libraries
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS base
 # -- Install runtime dependencies --
 RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum
 # -- Set CANN environment variables (required for runtime) --
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
 ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
 ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 # ... You can add other environment variables from the original file as needed ...
 WORKDIR /app
 # Copy compiled .so files from the build stage
 COPY --from=build /app/lib/ /app
 # ==============================================================================
 # FINAL STAGES (TARGETS)
 # ==============================================================================
 ### Target: full
 # Complete image with all tools, Python bindings, and dependencies
 # ==============================================================================
 FROM base AS full
 COPY --from=build /app/full /app
 # Install Python dependencies
 RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum
 # You need to provide a tools.sh script as the entrypoint
 ENTRYPOINT ["/app/tools.sh"]
 # If there is no tools.sh, you can set the default to start the server
 # ENTRYPOINT ["/app/llama-server"]
 ### Target: light
 # Lightweight image containing only llama-cli
 # ==============================================================================
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Target: server
 # Dedicated server image containing only llama-server
 # ==============================================================================
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@ -0,0 +1,45 @@
 # CI check that the pre-tokenizer hashes committed in convert_hf_to_gguf.py
 # agree with what convert_hf_to_gguf_update.py generates.
 name: Check Pre-Tokenizer Hashes
 # Only runs when one of the two conversion scripts is touched by a push or a pull request.
 on:
    push:
        paths:
            - 'convert_hf_to_gguf.py'
            - 'convert_hf_to_gguf_update.py'
    pull_request:
        paths:
            - 'convert_hf_to_gguf.py'
            - 'convert_hf_to_gguf_update.py'
 jobs:
    pre-tokenizer-hashes:
        runs-on: ubuntu-latest
        steps:
        - name: Checkout repository
          uses: actions/checkout@v4
        - name: Set up Python
          uses: actions/setup-python@v5
          with:
              python-version: '3.11'
        # Dependencies are installed into a local virtualenv (.venv) that the later steps call directly.
        - name: Install Python dependencies
          run: |
              python3 -m venv .venv
              .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
        # Keep a copy of the committed script in /tmp, then run the update script in --check-missing mode.
        # NOTE(review): the update script is expected to rewrite convert_hf_to_gguf.py in place - confirm.
        - name: Update pre-tokenizer hashes
          run: |
              cp convert_hf_to_gguf.py /tmp
              .venv/bin/python convert_hf_to_gguf_update.py --check-missing
        # Fail the job (exit 1) and print the full diff when the working copy differs from the saved copy.
        - name: Check if committed pre-tokenizer hashes matches generated version
          run: |
              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
                  echo "Differences found:"
                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
                  exit 1
              fi
              echo "Model pre-tokenizer hashes are up to date."
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -2647,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_out_freq = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--output-format"}, "{gguf,dat}",
        string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
        [](common_params & params, const std::string & value) {
            /**/ if (value == "gguf") { params.imat_dat = false; }
            else if (value == "dat")  { params.imat_dat = true;  }
            else { throw std::invalid_argument("invalid output format"); }
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--save-frequency"}, "N",
        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
        "|<function name=\"([^\"]+)\">"  // match 5 (function name again)
    );
-    if (auto res = builder.try_find_regex(open_regex)) {
+    while (auto res = builder.try_find_regex(open_regex)) {
        const auto & block_start = res->groups[1];
        std::string block_end = block_start.empty() ? "" : "```";
@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                    builder.consume_literal(block_end);
                    builder.consume_spaces();
                }
                builder.add_content(builder.consume_rest());
            } else {
                throw common_chat_msg_partial_exception("failed to parse tool call");
            }
@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                    builder.consume_spaces();
                }
            }
            builder.add_content(builder.consume_rest());
        }
    } else {
        builder.add_content(builder.consume_rest());
    }
    builder.add_content(builder.consume_rest());
 }
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
--- a/common/common.h
+++ b/common/common.h
@ -439,6 +439,7 @@ struct common_params {
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk
    bool    imat_dat    = false; // whether the legacy imatrix.dat format should be output
    bool process_output  = false; // collect data for the output tensor
    bool compute_ppl     = true;  // whether to compute perplexity
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -684,6 +684,9 @@ class TextModel(ModelBase):
        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
            res = "hunyuan"
        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
            # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
            res = "hunyuan-dense"
        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
            res = "falcon-h1"
@ -699,6 +702,9 @@ class TextModel(ModelBase):
        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
            res = "kimi-k2"
        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
            res = "qwen2"
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
@ -846,6 +852,9 @@ class TextModel(ModelBase):
        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
            res = "exaone4"
        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
            res = "mellum"
        if res is None:
            logger.warning("\n")
@ -6053,6 +6062,7 @@ class DeepseekModel(TextModel):
@ModelBase.register("DeepseekV2ForCausalLM")
@ModelBase.register("DeepseekV3ForCausalLM")
@ModelBase.register("KimiVLForConditionalGeneration")
 class DeepseekV2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@ -6155,6 +6165,13 @@ class DeepseekV2Model(TextModel):
    _experts: list[dict[str, Tensor]] | None = None
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # skip vision tensors and remove "language_model." for Kimi-VL
        if "vision_tower" in name or "multi_modal_projector" in name:
            return []
        if name.startswith("language_model."):
            name = name.replace("language_model.", "")
        # rename e_score_correction_bias tensors
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@ -7553,11 +7570,6 @@ class FalconH1Model(Mamba2Model):
 class HunYuanMoEModel(TextModel):
    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # For handling tied embeddings
        self._tok_embd = None
    def set_vocab(self):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@ -7651,9 +7663,6 @@ class HunYuanMoEModel(TextModel):
    _experts: list[dict[str, Tensor]] | None = None
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name == "model.embed_tokens.weight":
            self._tok_embd = data_torch.clone()
        if name == "lm_head.weight":
            if self.hparams.get("tie_word_embeddings", False):
                logger.info("Skipping tied output layer 'lm_head.weight'")
@ -7698,6 +7707,98 @@ class HunYuanMoEModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
            # 1. Get the pre-tokenizer identifier hash
            tokpre = self.get_vocab_base_pre(tokenizer)
            # 2. Reverse-engineer the merges list from mergeable_ranks
            merges = []
            vocab = {}
            mergeable_ranks = tokenizer.mergeable_ranks
            for token, rank in mergeable_ranks.items():
                vocab[QwenModel.token_bytes_to_string(token)] = rank
                if len(token) == 1:
                    continue
                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
                if len(merged) == 2:
                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
            # 3. Generate the tokens and toktypes lists
            vocab_size = self.hparams["vocab_size"]
            assert tokenizer.vocab_size == vocab_size
            special_tokens = tokenizer.special_tokens
            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
            tokens: list[str] = []
            toktypes: list[int] = []
            for i in range(vocab_size):
                if i not in reverse_vocab:
                    tokens.append(f"[PAD{i}]")
                    toktypes.append(gguf.TokenType.UNUSED)
                else:
                    token = reverse_vocab[i]
                    tokens.append(token)
                    if i in special_tokens.values():
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
                        toktypes.append(gguf.TokenType.NORMAL)
            # 4. Write all vocab-related fields to the GGUF writer
            self.gguf_writer.add_tokenizer_model("gpt2")
            self.gguf_writer.add_tokenizer_pre(tokpre)
            self.gguf_writer.add_token_list(tokens)
            self.gguf_writer.add_token_types(toktypes)
            self.gguf_writer.add_token_merges(merges)
            # 5. Add special tokens and chat templates
            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
            special_vocab.add_to_gguf(self.gguf_writer)
            # FIX for BOS token: Overwrite incorrect id read from config.json
            if self.hparams['hidden_size'] == 4096:
                self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        # Rope
        rope_scaling = hparams.get("rope_scaling", {})
        if rope_scaling.get("type") == "dynamic":
            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
            alpha = rope_scaling.get("alpha", 50)
            base = hparams.get("rope_theta", 10000.0)
            dim = hparams["head_dim"]
            scaled_base = base * (alpha ** (dim / (dim - 2)))
            self.gguf_writer.add_rope_freq_base(scaled_base)
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
            self.gguf_writer.add_rope_scaling_factor(1)
            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name == "lm_head.weight":
            if self.hparams.get("tie_word_embeddings", False):
                logger.info("Skipping tied output layer 'lm_head.weight'")
                return []
        return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.SMOLLM3
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -59,6 +59,10 @@ parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
 )
 parser.add_argument(
    "--check-missing", action="store_true",
    help="only check for missing pre-tokenizer hashes",
 )
 parser.add_argument(
    "hf_token",
    help="optional HF token",
@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
 if hf_token is None:
    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 if args.check_missing and args.full:
    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
    args.check_missing = False
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@ -130,6 +138,7 @@ models = [
    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 # some models are known to be broken upstream, so we will skip them as exceptions
@ -140,12 +149,14 @@ pre_computed_hashes = [
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
 ]
@ -220,12 +231,13 @@ if not args.full:
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]
-logging.info(f"Downloading {len(models)} models...")
+if not args.check_missing:
-for model in models:
+    logging.info(f"Downloading {len(models)} models...")
-    try:
+    for model in models:
-        download_model(model)
+        try:
-    except Exception as e:
+            download_model(model)
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+        except Exception as e:
            logger.error(f"Failed to download model {model['name']}. Error: {e}")
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -2016,6 +2016,9 @@ static bool ggml_backend_cann_cpy_tensor_async(
        (ggml_backend_cann_context*)backend_dst->context;
    size_t copy_size = ggml_nbytes(dst);
    if (copy_size == 0) {
        return true;
    }
    if (backend_src != backend_dst) {
        ggml_backend_cann_buffer_context* buf_ctx_src =
            (ggml_backend_cann_buffer_context*)buf_src->context;
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@ -37,17 +37,21 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@ -72,11 +76,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__loongarch64)
 // quants.c
@ -92,11 +98,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__riscv)
 // quants.c
@ -119,10 +127,12 @@
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__s390x__)
 // quants.c
@ -147,11 +157,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__wasm__)
 // quants.c
@ -175,10 +187,12 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #endif
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
 }
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    // Scalar reference GEMV.
    //   vy : read as an array of block_q8_K (one activation row, n / QK_K blocks)
    //   vx : read as an array of block_q2_Kx8 (groups of 8 interleaved columns)
    //   s  : receives nc floats, one per column
    // bs and nr are not used by this implementation.
    const int qk                = QK_K;
    const int nb                = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen          = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    // same UNUSED list as the sibling q2_K GEMM kernel
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float acc[8];      // scaled quant dot products, one per interleaved column
    float acc_min[8];  // min correction terms, subtracted at the end

    const block_q8_K * a_ptr = (const block_q8_K *) vy;

    const int n_col_groups = nc / ncols_interleaved;
    const int n_steps      = qk / (4 * blocklen);

    for (int x = 0; x < n_col_groups; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            acc[j]     = 0.0;
            acc_min[j] = 0.0;
        }

        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < n_steps; k++) {
                // four consecutive 16-byte scale rows, one per 2-bit field of the packed byte
                const uint8_t * sc      = b_ptr[l].scales + (k / 4) * 64;
                const int       a_base  = (k >> 2) * 128 + (k % 4) * blocklen;
                const int       sc_half = (k / 2) % 2;

                for (int j = 0; j < ncols_interleaved; j++) {
                    const int sc_idx = sc_half + j * 2;
                    const int b_base = k * ncols_interleaved * blocklen + j * blocklen;

                    int dot = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        // each byte carries four 2-bit quants
                        const auto packed = b_ptr[l].qs[b_base + i];
                        const int  q0 = (int8_t) (packed & 3);
                        const int  q1 = (int8_t) ((packed >> 2 ) & 3);
                        const int  q2 = (int8_t) ((packed >> 4 ) & 3);
                        const int  q3 = (int8_t) ((packed >> 6 ) & 3);

                        // the low 4 bits of the scale byte weight the quant product
                        const int p0 = (q0 * a_ptr[l].qs[a_base + i])      * (sc[sc_idx]      & 0xF);
                        const int p1 = (q1 * a_ptr[l].qs[a_base + i + 32]) * (sc[sc_idx + 16] & 0xF);
                        const int p2 = (q2 * a_ptr[l].qs[a_base + i + 64]) * (sc[sc_idx + 32] & 0xF);
                        const int p3 = (q3 * a_ptr[l].qs[a_base + i + 96]) * (sc[sc_idx + 48] & 0xF);

                        dot += p0 + p1 + p2 + p3;
                    }
                    acc[j] += dot * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }

            // the high 4 bits of each scale byte are multiplied with the activation block sums
            for (int sb = 0; sb < 8; sb++) {
                const uint8_t * mins = b_ptr[l].scales + sb * 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    acc_min[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }

        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = acc[j] - acc_min[j];
        }
    }
 }
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
 }
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);
    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;
    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512  + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512  + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512  + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for(int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for(int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) *  6);
                        for(int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
 }
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
    return out;
 }
 static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    // Builds one interleaved block_q2_Kx8 from the eight Q2_K blocks in[0..7].
    block_q2_Kx8 out;

    // Carry over the delta (d) and dmin value of each of the eight source blocks.
    for (int blk = 0; blk < 8; blk++) {
        out.d[blk]    = in[blk].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
        out.dmin[blk] = in[blk].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    // Interleave the quants: cycle through the eight source blocks, moving
    // sizeof(uint64_t) bytes per step.
    const int n_chunks = QK_K * 2 / blck_size_interleave;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        const int which_blk = chunk % 8;
        const int src_pos   = (chunk / 8) * blck_size_interleave;
        const int dst_pos   = chunk * blck_size_interleave;
        memcpy(&out.qs[dst_pos], &in[which_blk].qs[src_pos], sizeof(uint64_t));
    }

    // Rearrange the scale/min bytes. Each Q2_K block supplies 16 bytes, every byte
    // holding a 4-bit scale and a 4-bit min. The output has 128 bytes split into
    // eight 16-byte groups; group g collects source bytes 2*g and 2*g + 1 from all
    // eight blocks, laid out as (block 0: two bytes), (block 1: two bytes), ...
    for (int grp = 0; grp < 8; grp++) {
        for (int pos = 0; pos < 16; pos++) {
            const int which_blk  = pos / 2;
            const int which_byte = grp * 2 + (pos % 2);
            out.scales[grp * 16 + pos] = in[which_blk].scales[which_byte];
        }
    }

    return out;
 }
 static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
    GGML_UNUSED(data_size);
 }
 static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
    GGML_ASSERT(interleave_block == 8);
    constexpr int nrows_interleaved = 8;
    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
    const block_q2_K * src = (const block_q2_K*) data;
    block_q2_K dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK_K;
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }
    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i  = 0; i < nrows_interleaved; i++ ) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;
    GGML_UNUSED(data_size);
 }
 static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 8);
@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
 }
 // repack specialization for <block_q2_K, 8, 8>: forwards to repack_q2_K_to_q2_K_8_bl
 // with an interleave block of 8 and returns its result unchanged.
 template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
 }
 template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
 }
@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 // gemv specialization for <block_q2_K, 8, 8, GGML_TYPE_Q8_K>: passes every argument
 // through to ggml_gemv_q2_K_8x8_q8_K.
 template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 // gemm specialization for <block_q2_K, 8, 8, GGML_TYPE_Q8_K>: passes every argument
 // through to ggml_gemm_q2_K_8x8_q8_K.
 template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
--- a/ggml/src/ggml-cpu/repack.h
+++ b/ggml/src/ggml-cpu/repack.h
@ -44,7 +44,14 @@ struct block_q4_Kx8 {
 };
 static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
 // Interleaved Q2_K block with eight-entry d/dmin arrays.
 // The static_assert below pins the layout: 16 ggml_half values, QK_K/2 bytes of
 // scales/mins and QK_K*2 bytes of quants, with no padding.
 struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scales for the quantized scales (8 entries)
    ggml_half dmin[8];   // super-block scales for the quantized mins (8 entries)
    uint8_t scales[128];  // scales and mins, quantized with 4 bits each
    uint8_t qs[512];    // 2-bit quants
 };
 static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
 struct block_q8_Kx4 {
    float d[4];              // delta
    int8_t qs[QK_K * 4];     // quants
@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 // Native implementations
@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 #if defined(__cplusplus)
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@ -315,8 +315,9 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
-    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies &&
+    const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
-        (Q->ne[3] > 1 || cc < GGML_CUDA_CC_ADA_LOVELACE) && !mma_needs_data_conversion;
+    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
        (cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
        if (prec == GGML_PREC_DEFAULT) {
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
    ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
    ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
    bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
    bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
    // Handle src0
    src0_ptr = (const cuda_t *) src0->data;
@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
        s11 = ne10;
        s12 = ne11*s11;
        s13 = ne12*s12;
        is_src1_cont_2 = true;
    }
    // Setup destination buffer
@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
-    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+    if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
        // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
        const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
        const int64_t smb = ne12 == 1 ? s13       : s12;
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
+                alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma,     // strideA
-                       src1_ptr, cu_data_type_b, s11,       s12,       // strideB
+                       src1_ptr, cu_data_type_b, s11,       smb,     // strideB
-                beta,     dst_t, cu_data_type,   ne0,       ne1*ne0,   // strideC
+                beta,     dst_t, cu_data_type,   ne0,       ne1*ne0, // strideC
                ne12*ne13,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@ -1,65 +1,75 @@
 #include "im2col.cuh"
 // Smaller of a and b. The whole expansion is parenthesized so that operators
 // surrounding a MIN(...) call cannot bind into the conditional expression
 // (e.g. MIN(x, y) + 1). Each argument may be evaluated more than once, so
 // avoid arguments with side effects.
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX_GRIDDIM_Z 65535
 template <typename T>
 static  __global__ void im2col_kernel(
-        const float * x, T * dst, int64_t batch_offset,
+        const float * x, T * dst,
-        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+        int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
        int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW,
        int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
+    if (i >= IC_KH_KW) {
        return;
    }
-    const int64_t  ksize = OW * KH;
+    const int64_t iic = i / (KH_KW);
-    const int64_t  kx = i / ksize;
+    const int64_t rem = i - iic * KH_KW;
-    const int64_t  kd = kx * ksize;
+    const int64_t ikh = rem / KW;
-    const int64_t  ky = (i - kd) / OW;
+    const int64_t ikw = rem - ikh * KW;
    const int64_t  ix = i % OW;
-    const int64_t  oh = blockIdx.y;
+    const int64_t  iow = blockIdx.y;
-    const int64_t  batch = blockIdx.z / IC;
+    for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
-    const int64_t  ic = blockIdx.z % IC;
+        const int64_t  in = iz / OH;
        const int64_t  ioh = iz - in * OH;
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
+        const int64_t iiw = iow * s0 + ikw * d0 - p0;
-    const int64_t iih = oh * s1 + ky * d1 - p1;
+        const int64_t iih = ioh * s1 + ikh * d1 - p1;
-    const int64_t offset_dst =
+        const int64_t offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
+            ((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
        (ic * (KW * KH) + ky * KW + kx);
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
+            dst[offset_dst] = 0.0f;
-    } else {
+        } else {
-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+            const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+            dst[offset_dst] = x[offset_src + iih * IW + iiw];
        }
    }
 }
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
 template <typename T>
 static void im2col_cuda(const float * x, T* dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int parallel_elements = OW * KW * KH;
+    const int64_t IC_KH_KW = IC * KH * KW;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, batch * IC);
+    const int64_t N_OH = N * OH;
-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+    const int64_t KH_KW = KW*KH;
    dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
    im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
                                                                                     IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
                                                                                     s0, s1, p0, p1, d0, d1);
 }
 static void im2col_cuda_f16(const float * x, half * dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
 }
 static void im2col_cuda_f32(const float * x, float * dst,
    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
 }
 void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -91,13 +101,13 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int64_t OH = is_2D ? dst->ne[2] : 1;
    const int64_t OW =         dst->ne[1];
-    const size_t  delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t IC_IH_IW = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch        = src1->ne[is_2D ? 3 : 2];
+    const int64_t N        = src1->ne[is_2D ? 3 : 2];
-    const size_t  batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const int64_t IH_IW    = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
    if(dst->type == GGML_TYPE_F16) {
-        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
    } else {
-        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
    }
 }
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@ -251,25 +251,21 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
 #endif // AMD_MFMA_AVAILABLE
 #if defined(GGML_USE_HIP)
-static int mmq_get_nwarps_host(const int cc) {
+static int mmq_get_nwarps_host(const int cc, const int warp_size) {
-    return amd_mfma_available(cc) ? 8 : 4;
+    return amd_mfma_available(cc) ? 8 : 256/warp_size;
 }
 #else
-static int mmq_get_nwarps_host(const int /*cc*/) {
+static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
-    return 8;
+    return 256/warp_size;
 }
 #endif // (GGML_USE_HIP)
 static constexpr __device__ int mmq_get_nwarps_device() {
 #if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
    return 8;
 #else
-    return 4;
+    return 256/ggml_cuda_get_physical_warp_size();
 #endif // AMD_MFMA_AVAILABLE
 #else
    return 8;
 #endif // defined(GGML_USE_HIP)
 }
 // ------------------------------------------------------------
@ -3472,7 +3468,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int cc = ggml_cuda_info().devices[id].cc;
    const int nsm = ggml_cuda_info().devices[id].nsm;
    const int warp_size = ggml_cuda_info().devices[id].warp_size;
-    const int nwarps = mmq_get_nwarps_host(cc);
+    const int nwarps = mmq_get_nwarps_host(cc, warp_size);
    const int mmq_y = get_mmq_y_host(cc);
    const dim3 block_dims(warp_size, nwarps, 1);
@ -3559,7 +3555,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
    const int    cc     = ggml_cuda_info().devices[id].cc;
    const size_t smpbo  = ggml_cuda_info().devices[id].smpbo;
    const int warp_size = ggml_cuda_info().devices[id].warp_size;
-    const int nwarps    = mmq_get_nwarps_host(cc);
+    const int nwarps    = mmq_get_nwarps_host(cc, warp_size);
    const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y = get_mmq_y_host(cc);
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mm_f32_f32_l4_lm;
    cl_program program_mul_mm_f16_f32_l4_lm;
-    cl_kernel kernel_add, kernel_add_row;
+    cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
-    cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
-    cl_kernel kernel_div, kernel_div_row;
+    cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
-    cl_kernel kernel_sub, kernel_sub_row;
+    cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
    cl_kernel kernel_scale;
    cl_kernel kernel_silu, kernel_silu_4;
    cl_kernel kernel_gelu, kernel_gelu_4;
@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        backend_ctx->program_add =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        CL_CHECK((backend_ctx->kernel_add     = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
+        CL_CHECK((backend_ctx->kernel_add         = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
-        CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_add_row     = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
        CL_CHECK((backend_ctx->kernel_add_f16     = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }
@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        backend_ctx->program_mul =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        CL_CHECK((backend_ctx->kernel_mul     = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul         = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
-        CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_row     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
        CL_CHECK((backend_ctx->kernel_mul_f16     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }
@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
        const std::string kernel_src = read_file("div.cl");
 #endif
        std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                               " -cl-mad-enable -cl-finite-math-only ";
        backend_ctx->program_div =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+        CL_CHECK((backend_ctx->kernel_div         = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
-        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row     = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
        CL_CHECK((backend_ctx->kernel_div_f16     = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }
@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        backend_ctx->program_sub =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-        CL_CHECK((backend_ctx->kernel_sub     = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub         = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
-        CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
        CL_CHECK((backend_ctx->kernel_sub_f16     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }
@ -2035,8 +2046,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
    backend_ctx->has_vector_subgroup_broadcast =
-        backend_ctx->adreno_cl_compiler_version.major >= 47 ||
+        (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
-        backend_ctx->adreno_cl_compiler_version.major == 17;
+        (backend_ctx->adreno_cl_compiler_version.type == DX   && backend_ctx->adreno_cl_compiler_version.major >= 17);
    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                default:
                    return false;
            }
        case GGML_OP_ADD:
        case GGML_OP_SCALE:
            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SUB:
-            return op->src[0]->type == GGML_TYPE_F32;
+            return (op->src[0]->type == op->src[1]->type) &&
                   (op->src[0]->type == op->type) &&
                   (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
-    const int  ne00 = src0 ? src0->ne[0] : 0;
+    GGML_ASSERT(src0->type == src1->type);
-    const int  ne01 = src0 ? src0->ne[1] : 0;
+    GGML_ASSERT(src0->type == dst->type);
-    const int  ne02 = src0 ? src0->ne[2] : 0;
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    const int  ne03 = src0 ? src0->ne[3] : 0;
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const int  ne00 = src0->ne[0];
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const int  ne01 = src0->ne[1];
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const int  ne02 = src0->ne[2];
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    const int  ne03 = src0->ne[3];
-    const int  ne10 = src1 ? src1->ne[0] : 0;
+    const cl_ulong nb00 = src0->nb[0];
-    const int  ne11 = src1 ? src1->ne[1] : 0;
+    const cl_ulong nb01 = src0->nb[1];
-    const int  ne12 = src1 ? src1->ne[2] : 0;
+    const cl_ulong nb02 = src0->nb[2];
-    const int  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+    const cl_ulong nb03 = src0->nb[3];
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const int  ne10 = src1->ne[0];
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const int  ne11 = src1->ne[1];
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const int  ne12 = src1->ne[2];
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+    const int  ne13 = src1->ne[3]; UNUSED(ne13);
-    const int  ne0  = dst ? dst->ne[0] : 0;
+    const cl_ulong nb10 = src1->nb[0];
-    const int  ne1  = dst ? dst->ne[1] : 0;
+    const cl_ulong nb11 = src1->nb[1];
-    const int  ne2  = dst ? dst->ne[2] : 0;
+    const cl_ulong nb12 = src1->nb[2];
-    const int  ne3  = dst ? dst->ne[3] : 0;
+    const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
-    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
+    const int  ne0  = dst->ne[0];
-    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
+    const int  ne1  = dst->ne[1];
-    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
+    const int  ne2  = dst->ne[2];
-    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+    const int  ne3  = dst->ne[3];
    const cl_ulong nb0  = dst->nb[0];
    const cl_ulong nb1  = dst->nb[1];
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
        bcast_row = true;
        int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_add_row;
+
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_add_row;
        } else {
            kernel = backend_ctx->kernel_add_row_f16;
        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
    } else {
-        kernel = backend_ctx->kernel_add;
+        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_add;
        } else {
            kernel = backend_ctx->kernel_add_f16;
        }
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
-    const int ne00 = src0 ? src0->ne[0] : 0;
+    GGML_ASSERT(src0->type == src1->type);
-    const int ne01 = src0 ? src0->ne[1] : 0;
+    GGML_ASSERT(src0->type == dst->type);
-    const int ne02 = src0 ? src0->ne[2] : 0;
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    const int ne03 = src0 ? src0->ne[3] : 0;
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const int ne00 = src0->ne[0];
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const int ne01 = src0->ne[1];
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const int ne02 = src0->ne[2];
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    const int ne03 = src0->ne[3];
-    const int ne10 = src1 ? src1->ne[0] : 0;
+    const cl_ulong nb00 = src0->nb[0];
-    const int ne11 = src1 ? src1->ne[1] : 0;
+    const cl_ulong nb01 = src0->nb[1];
-    const int ne12 = src1 ? src1->ne[2] : 0;
+    const cl_ulong nb02 = src0->nb[2];
-    const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+    const cl_ulong nb03 = src0->nb[3];
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const int ne10 = src1->ne[0];
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const int ne11 = src1->ne[1];
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const int ne12 = src1->ne[2];
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+    const int ne13 = src1->ne[3]; UNUSED(ne13);
-    const int ne0  = dst ? dst->ne[0] : 0;
+    const cl_ulong nb10 = src1->nb[0];
-    const int ne1  = dst ? dst->ne[1] : 0;
+    const cl_ulong nb11 = src1->nb[1];
-    const int ne2  = dst ? dst->ne[2] : 0;
+    const cl_ulong nb12 = src1->nb[2];
-    const int ne3  = dst ? dst->ne[3] : 0;
+    const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
-    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
+    const int ne0  = dst->ne[0];
-    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
+    const int ne1  = dst->ne[1];
-    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
+    const int ne2  = dst->ne[2];
-    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+    const int ne3  = dst->ne[3];
    const cl_ulong nb0  = dst->nb[0];
    const cl_ulong nb1  = dst->nb[1];
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
        bcast_row = true;
        int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_mul_row;
+
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_mul_row;
        } else {
            kernel = backend_ctx->kernel_mul_row_f16;
        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
    } else {
-        kernel = backend_ctx->kernel_mul;
+        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_mul;
        } else {
            kernel = backend_ctx->kernel_mul_f16;
        }
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == src1->type);
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
        bcast_row = true;
        int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_div_row;
+
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_div_row;
        } else {
            kernel = backend_ctx->kernel_div_row_f16;
        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
    } else {
-        kernel = backend_ctx->kernel_div;
+        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_div;
        } else {
            kernel = backend_ctx->kernel_div_f16;
        }
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == src1->type);
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
        bcast_row = true;
        int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_sub_row;
+
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_sub_row;
        } else {
            kernel = backend_ctx->kernel_sub_row_f16;
        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
    } else {
-        kernel = backend_ctx->kernel_sub;
+        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_sub;
        } else {
            kernel = backend_ctx->kernel_sub_f16;
        }
        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
--- a/ggml/src/ggml-opencl/kernels/add.cl
+++ b/ggml/src/ggml-opencl/kernels/add.cl
@ -81,3 +81,76 @@ kernel void kernel_add_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] + src1[idx1];
 }
 // Element-wise half-precision addition: dst = src0 + src1.
 //
 // All three buffers are addressed as raw bytes: offset0/offset1/offsetd are
 // byte offsets applied to the base pointers, and the nb* arguments are byte
 // strides per dimension. The work-group ids select the row (dims 1, 2, 3) and
 // the work-items of a group stride across dim 0 of that row. src1 indices are
 // wrapped with its own extents (ne10..ne13), so a smaller src1 repeats.
 // ne00..ne03 and ne1..ne3 are part of the argument list but are not read here.
 kernel void kernel_add_f16(
        global char * src0,
        ulong  offset0,
        global char * src1,
        ulong  offset1,
        global char * dst,
        ulong  offsetd,
        int   ne00,
        int   ne01,
        int   ne02,
        int   ne03,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int   ne10,
        int   ne11,
        int   ne12,
        int   ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int   ne0,
        int   ne1,
        int   ne2,
        int   ne3,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
 ) {
    // Rebase the byte pointers onto the start of each tensor's data.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;
    // Row coordinates come from the work-group ids.
    const int row1 = get_group_id(0);
    const int row2 = get_group_id(1);
    const int row3 = get_group_id(2);
    // Matching src1 coordinates, wrapped to src1's extents.
    const int b_row1 = row1 % ne11;
    const int b_row2 = row2 % ne12;
    const int b_row3 = row3 % ne13;
    global char * a_row   = src0 + row3*nb03   + row2*nb02   + row1*nb01;
    global char * b_row   = src1 + b_row3*nb13 + b_row2*nb12 + b_row1*nb11;
    global char * out_row = dst  + row3*nb3    + row2*nb2    + row1*nb1;
    // Each work-item handles every get_local_size(0)-th element of the row.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int b_col = col % ne10;
        const half a = *((global half *)(a_row + col*nb00));
        const half b = *((global half *)(b_row + b_col*nb10));
        *((global half *)(out_row + col*nb0)) = a + b;
    }
 }
 // Row-broadcast half4 addition: dst[i] = src0[i] + src1[i % ne].
 //
 // offset0/offset1/offsetd are byte offsets applied to the base pointers.
 // Every work-item produces one half4, indexed by its global id; src1 is
 // indexed modulo ne, so its first ne half4 values are reused cyclically.
 kernel void kernel_add_row_f16(
        global half4 * src0,
        ulong  offset0,
        global half4 * src1,
        ulong  offset1,
        global half4 * dst,
        ulong  offsetd,
        int ne
 ) {
    global half4 * a   = (global half4 *)((global char *)src0 + offset0);
    global half4 * b   = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);
    const uint i = get_global_id(0);
    // i % ne, spelled as divide/multiply/subtract because this performs better than using %.
    const uint q = i / ne;
    const uint j = i - q*ne;
    out[i] = a[i] + b[j];
 }
--- a/ggml/src/ggml-opencl/kernels/div.cl
+++ b/ggml/src/ggml-opencl/kernels/div.cl
@ -70,3 +70,69 @@ kernel void kernel_div_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] / src1[idx1];
 }
 // Element-wise half-precision division: dst = src0 / src1.
 //
 // All three buffers are addressed as raw bytes: offset0/offset1/offsetd are
 // byte offsets applied to the base pointers, and the nb* arguments are byte
 // strides per dimension. The work-group ids select the row (dims 1, 2, 3) and
 // the work-items of a group stride across dim 0 of that row. src1 indices are
 // wrapped with its own extents (ne10..ne13), so a smaller src1 repeats.
 kernel void kernel_div_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
 ) {
    // Rebase the byte pointers onto the start of each tensor's data.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;
    // Row coordinates come from the work-group ids.
    const int row1 = get_group_id(0);
    const int row2 = get_group_id(1);
    const int row3 = get_group_id(2);
    // Matching src1 coordinates, wrapped to src1's extents.
    const int b_row1 = row1 % ne11;
    const int b_row2 = row2 % ne12;
    const int b_row3 = row3 % ne13;
    global char * a_row   = src0 + row3*nb03   + row2*nb02   + row1*nb01;
    global char * b_row   = src1 + b_row3*nb13 + b_row2*nb12 + b_row1*nb11;
    global char * out_row = dst  + row3*nb3    + row2*nb2    + row1*nb1;
    // Each work-item handles every get_local_size(0)-th element of the row.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int b_col = col % ne10;
        const half a = *((global half *)(a_row + col*nb00));
        const half b = *((global half *)(b_row + b_col*nb10));
        *((global half *)(out_row + col*nb0)) = a / b;
    }
 }
 // Row-broadcast half4 division: dst[i] = src0[i] / src1[i % ne].
 //
 // offset0/offset1/offsetd are byte offsets applied to the base pointers.
 // Every work-item produces one half4, indexed by its global id; src1 is
 // indexed modulo ne, so its first ne half4 values are reused cyclically.
 kernel void kernel_div_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
 ) {
    global half4 * a   = (global half4 *)((global char *)src0 + offset0);
    global half4 * b   = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);
    const uint i = get_global_id(0);
    // i % ne, spelled as divide/multiply/subtract because this performs better than using %.
    const uint q = i / ne;
    const uint j = i - q*ne;
    out[i] = a[i] / b[j];
 }
--- a/ggml/src/ggml-opencl/kernels/mul.cl
+++ b/ggml/src/ggml-opencl/kernels/mul.cl
@ -77,3 +77,76 @@ kernel void kernel_mul_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] * src1[idx1];
 }
 // Element-wise half-precision multiplication: dst = src0 * src1.
 //
 // All three buffers are addressed as raw bytes: offset0/offset1/offsetd are
 // byte offsets applied to the base pointers, and the nb* arguments are byte
 // strides per dimension. The work-group ids select the row (dims 1, 2, 3) and
 // the work-items of a group stride across dim 0 of that row. src1 indices are
 // wrapped with its own extents (ne10..ne13), so a smaller src1 repeats.
 // ne00..ne03 and ne1..ne3 are part of the argument list but are not read here.
 kernel void kernel_mul_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int ne2,
        int ne3,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
 ) {
    // Rebase the byte pointers onto the start of each tensor's data.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;
    // Row coordinates come from the work-group ids.
    const int row1 = get_group_id(0);
    const int row2 = get_group_id(1);
    const int row3 = get_group_id(2);
    // Matching src1 coordinates, wrapped to src1's extents.
    const int b_row1 = row1 % ne11;
    const int b_row2 = row2 % ne12;
    const int b_row3 = row3 % ne13;
    global char * a_row   = src0 + row3*nb03   + row2*nb02   + row1*nb01;
    global char * b_row   = src1 + b_row3*nb13 + b_row2*nb12 + b_row1*nb11;
    global char * out_row = dst  + row3*nb3    + row2*nb2    + row1*nb1;
    // Each work-item handles every get_local_size(0)-th element of the row.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int b_col = col % ne10;
        const half a = *((global half *)(a_row + col*nb00));
        const half b = *((global half *)(b_row + b_col*nb10));
        *((global half *)(out_row + col*nb0)) = a * b;
    }
 }
 // Row-broadcast half4 multiplication: dst[i] = src0[i] * src1[i % ne].
 //
 // offset0/offset1/offsetd are byte offsets applied to the base pointers.
 // Every work-item produces one half4, indexed by its global id; src1 is
 // indexed modulo ne, so its first ne half4 values are reused cyclically.
 kernel void kernel_mul_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
 ) {
    global half4 * a   = (global half4 *)((global char *)src0 + offset0);
    global half4 * b   = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);
    const uint i = get_global_id(0);
    // i % ne, spelled as divide/multiply/subtract because this performs better than using %.
    const uint q = i / ne;
    const uint j = i - q*ne;
    out[i] = a[i] * b[j];
 }
--- a/ggml/src/ggml-opencl/kernels/sub.cl
+++ b/ggml/src/ggml-opencl/kernels/sub.cl
@ -70,3 +70,69 @@ kernel void kernel_sub_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
 }
 // Element-wise half-precision subtraction: dst = src0 - src1.
 //
 // All three buffers are addressed as raw bytes: offset0/offset1/offsetd are
 // byte offsets applied to the base pointers, and the nb* arguments are byte
 // strides per dimension. The work-group ids select the row (dims 1, 2, 3) and
 // the work-items of a group stride across dim 0 of that row. src1 indices are
 // wrapped with its own extents (ne10..ne13), so a smaller src1 repeats.
 kernel void kernel_sub_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
 ) {
    // Rebase the byte pointers onto the start of each tensor's data.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;
    // Row coordinates come from the work-group ids.
    const int row1 = get_group_id(0);
    const int row2 = get_group_id(1);
    const int row3 = get_group_id(2);
    // Matching src1 coordinates, wrapped to src1's extents.
    const int b_row1 = row1 % ne11;
    const int b_row2 = row2 % ne12;
    const int b_row3 = row3 % ne13;
    global char * a_row   = src0 + row3*nb03   + row2*nb02   + row1*nb01;
    global char * b_row   = src1 + b_row3*nb13 + b_row2*nb12 + b_row1*nb11;
    global char * out_row = dst  + row3*nb3    + row2*nb2    + row1*nb1;
    // Each work-item handles every get_local_size(0)-th element of the row.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int b_col = col % ne10;
        const half a = *((global half *)(a_row + col*nb00));
        const half b = *((global half *)(b_row + b_col*nb10));
        *((global half *)(out_row + col*nb0)) = a - b;
    }
 }
 // Row-broadcast half4 subtraction: dst[i] = src0[i] - src1[i % ne].
 //
 // offset0/offset1/offsetd are byte offsets applied to the base pointers.
 // Every work-item produces one half4, indexed by its global id; src1 is
 // indexed modulo ne, so its first ne half4 values are reused cyclically.
 kernel void kernel_sub_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
 ) {
    global half4 * a   = (global half4 *)((global char *)src0 + offset0);
    global half4 * b   = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);
    const uint i = get_global_id(0);
    // i % ne, spelled as divide/multiply/subtract because this performs better than using %.
    const uint q = i / ne;
    const uint j = i - q*ne;
    out[i] = a[i] - b[j];
 }
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@ -2688,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    const size_t       type_size_src0 = ggml_type_size(src0->type);
    const size_t       type_size_src1 = ggml_type_size(src1->type);
    bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
    bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
    // SRC1 strides
    int64_t                          s11 = nb11 / type_size_src1;
    int64_t                          s12 = nb12 / type_size_src1;
@ -2737,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
        s11      = ne10;
        s12      = ne11 * s11;
        s13      = ne12 * s12;
        is_src1_cont_2 = true;
    }
    ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@ -2852,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    else
 #endif
    {
-        if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+        if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
            // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
            const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
            const int64_t smb = ne12 == 1 ? s13       : s12;
            // there is no broadcast and src0, src1 are contiguous across dims 2, 3
            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
                                                        oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
-                                                        src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
+                                                        src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
-                                                        src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
+                                                        src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
                                                        mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
        } else {
            const int ne23 = ne12 * ne13;
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -222,6 +222,7 @@ enum vk_device_architecture {
    AMD_RDNA2,
    AMD_RDNA3,
    INTEL_XE2,
    NVIDIA_PRE_TURING,
 };
 // HSK x HSV
@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
            // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
            return vk_device_architecture::INTEL_XE2;
        }
    } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
        bool cooperative_matrix = false;
        // Detect "pre-turing" based on lack of coopmat support.
        for (const auto& properties : ext_props) {
            if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
                cooperative_matrix = true;
                break;
            }
        }
        if (!cooperative_matrix) {
            return vk_device_architecture::NVIDIA_PRE_TURING;
        }
    }
    return vk_device_architecture::OTHER;
 }
 // Tile-shape variants of the conv2d pipelines. Each enumerator is used as an
 // index into the per-shape pipeline arrays; the name encodes the BS_K x BS_NPQ
 // block sizes selected for that variant when the shaders are loaded.
 enum vk_conv_shapes {
    CONV_SHAPE_128x128 = 0,  // BS_K = 128, BS_NPQ = 128
    CONV_SHAPE_64x32   = 1,  // BS_K = 64,  BS_NPQ = 32
    CONV_SHAPE_32x256  = 2,  // BS_K = 32,  BS_NPQ = 256
    CONV_SHAPE_COUNT   = 3,  // number of shapes; sizes the pipeline arrays
 };
 struct vk_device_struct {
    std::recursive_mutex mutex;
@ -483,8 +507,8 @@ struct vk_device_struct {
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
    vk_pipeline pipeline_opt_step_adamw_f32;
-    vk_pipeline pipeline_conv2d_f32;
+    vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
-    vk_pipeline pipeline_conv2d_f16_f32;
+    vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
    vk_pipeline pipeline_conv2d_dw_whcn_f32;
    vk_pipeline pipeline_conv2d_dw_cwhn_f32;
@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
    uint32_t nb1;
    uint32_t nb2;
    uint32_t nb3;
    // init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
    uint32_t KWmp;   uint32_t KWL;
    uint32_t KWKHmp; uint32_t KWKHL;
    uint32_t OWmp;   uint32_t OWL;
    uint32_t OWOHmp; uint32_t OWOHL;
 };
 // Specialization for the conv2d push constants: fills the four (mp, L) pairs
 // via init_fastdiv_values, one pair per divisor (KW, KW*KH, OW, OW*OH).
 template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
    // Products are formed up front; the calls below only write the *mp / *L fields.
    const auto kw_kh = p.KW * p.KH;
    const auto ow_oh = p.OW * p.OH;

    init_fastdiv_values(p.KW,  p.KWmp,   p.KWL);
    init_fastdiv_values(kw_kh, p.KWKHmp, p.KWKHL);
    init_fastdiv_values(p.OW,  p.OWmp,   p.OWL);
    init_fastdiv_values(ow_oh, p.OWOHmp, p.OWOHL);
 }
 struct vk_op_conv2d_dw_push_constants {
    uint32_t ne;
    uint32_t batches;
@ -2068,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
        s_mmq_wg_denoms = { 32,  64,  1 };
        // spec constants and tile sizes for quant matmul (Qi_K)
-        l_warptile_mmq_k = { 256, 64, 128, 64,  1 };
+        l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
-        m_warptile_mmq_k = { 256, 32,  64, 64,  0 };
+        m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
-        s_warptile_mmq_k = { 256, 32,  32, 128, 0 };
+        s_warptile_mmq_k = { 256, 32,  64, 128, 0 };
-        l_mmq_wg_denoms_k = { 64, 128, 1 };
+        l_mmq_wg_denoms_k = { 128, 256, 1 };
-        m_mmq_wg_denoms_k = { 32,  64, 1 };
+        m_mmq_wg_denoms_k = { 128, 128, 1 };
-        s_mmq_wg_denoms_k = { 32,  32, 1 };
+        s_mmq_wg_denoms_k = { 32,  64,  1 };
        // spec constants and tile sizes for quant matmul_id
        l_warptile_mmqid = { 256, 128, 128, 16, 0 };
@ -2847,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len,              mul_mat_vec_p021_f16_f32_data,              "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
        }
    }
-    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@ -3048,48 +3086,105 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
    // conv2d
-    uint32_t conv2d_WG_SIZE  = 256;
+    for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
-    uint32_t conv2d_BS_K     = 128;
+        uint32_t conv2d_WG_SIZE  = 256;
-    uint32_t conv2d_BS_CRS   = 16;
+        uint32_t conv2d_BS_K     = 128;
-    uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
+        uint32_t conv2d_BS_CRS   = 16;
-    if (device->subgroup_shuffle &&
+        uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
-        device->vendor_id != VK_VENDOR_ID_INTEL) {  // Do not enable collectives on Intel, see PR 14316
+        uint32_t conv2d_BS_NPQ = 128;
-        use_collectives = 1;
+        uint32_t conv2d_TS_K   = 8;
-        conv2d_BS_CRS   = std::min(
+        uint32_t conv2d_SHMEM_PAD = 4;
-            device->subgroup_size,
+        bool conv2d_UNROLL = true;
            conv2d_BS_CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
    }
    uint32_t conv2d_BS_NPQ = 128;
    uint32_t conv2d_TS_K   = 8;
    uint32_t conv2d_shmem_req =
        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
    if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
        conv2d_BS_CRS = 8;
        if (use_collectives) {
            conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
        }
    }
-    if (use_collectives) {
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
-        ggml_vk_create_pipeline(
+        if (device->coopmat2) {
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+            conv2d_SHMEM_PAD = 8; // 8 float16_t
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+        }
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+#endif
-        ggml_vk_create_pipeline(
+
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+        if (device->vendor_id == VK_VENDOR_ID_INTEL) {
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            conv2d_SHMEM_PAD = 0;
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+            conv2d_UNROLL = false;
-    } else {
+        } else if (device->vendor_id == VK_VENDOR_ID_AMD) {
-        ggml_vk_create_pipeline(
+            conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+        }
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+        switch (s) {
-            false);
+        default:
-        ggml_vk_create_pipeline(
+        case CONV_SHAPE_128x128:
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+            conv2d_BS_K = 128;
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            conv2d_BS_NPQ = 128;
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+            conv2d_BS_CRS = 16;
-            false);
+            if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
                conv2d_UNROLL = false;
            }
            break;
        case CONV_SHAPE_64x32:
            conv2d_BS_K = 64;
            conv2d_BS_NPQ = 32;
            conv2d_BS_CRS = 32;
            conv2d_TS_K   = 4;
            break;
        case CONV_SHAPE_32x256:
            conv2d_BS_K = 32;
            conv2d_BS_NPQ = 256;
            conv2d_BS_CRS = 16;
            break;
        }
        // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
        bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
                                    device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
        bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
                                     device->architecture == vk_device_architecture::AMD_GCN;
        if (device->subgroup_shuffle &&
            device->vendor_id != VK_VENDOR_ID_INTEL &&   // Do not enable collectives on Intel, see PR 14316.
            allow_collectives_nv &&
            allow_collectives_amd) {
            use_collectives = 1;
            conv2d_BS_CRS   = std::min(
                device->subgroup_size,
                conv2d_BS_CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
        }
        uint32_t conv2d_shmem_req =
            (conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
        if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
            conv2d_BS_CRS = 8;
            if (use_collectives) {
                conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
            }
        }
        std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
        std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
 #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
        if (device->coopmat2) {
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
        } else
 #endif
        if (conv2d_UNROLL) {
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
        } else {
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
            ggml_vk_create_pipeline(
                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
        }
    }
    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@ -4943,26 +5038,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
    ggml_vk_queue_command_pools_cleanup(dst->device);
 }
-static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
+static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
    uint32_t split_k = 1;
-    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
+    if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
        // If k is 'large' and the SMs will fill less than halfway, use split_k.
        uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
        uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
-        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
+
-            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+        if (k >= 2048) {
-            // Clamp to 2 or 4
+            if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
-            split_k = std::min(split_k, 4u);
+                split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
-            if (split_k == 3) {
+            } else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
-                split_k = 2;
+                split_k = 3;
            }
-            if (ctx->device->coopmat2) {
+            // Cap the split at 8x. Unless k is huge this is a lot of overhead.
-                // coopmat2 shader expects splits to be aligned to 256
+            split_k = std::min(split_k, 8u);
-                while (split_k > 1 && ((k / split_k) % 256) != 0) {
+
-                    split_k /= 2;
+            // ggml_vk_matmul will align the splits to be a multiple of 256.
            // If this rounded up size would cause the last split to be empty,
            // then reduce the split count.
            while (true) {
                if (split_k == 1) {
                    break;
                }
                uint32_t k_split = CEIL_DIV(k, split_k);
                k_split = ROUNDUP_POW2(k_split, 256);
                if (k_split * (split_k - 1) < k) {
                    break;
                }
                split_k--;
            }
        }
    }
@ -4974,9 +5080,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
    if (ctx->device->coopmat2) {
        const uint32_t shader_core_count = ctx->device->shader_core_count;
        const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
        const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);
        // Use large shader when the N dimension is greater than the medium shader's tile size
        uint32_t crossover_large = mmp->m->wg_denoms[1];
-        if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
+
        // Prefer large over medium if either:
        // - medium or large tiles would overfill the GPU
        // - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
        //   (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
        bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
                            // split_k==3 with large tiles likely better than medium tiles with no split_k.
                            (tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);
        if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
            return aligned ? mmp->a_l : mmp->l;
        }
        // Use medium shader when the N dimension is greater than the small shader's tile size
@ -5020,7 +5139,11 @@ static void ggml_vk_matmul(
    GGML_ASSERT(batch_stride_d == m * n);
-    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
+    // Round the split size up to a multiple of 256 (k-quant alignment)
    uint32_t k_split = CEIL_DIV(k, split_k);
    k_split = ROUNDUP_POW2(k_split, 256);
    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
    // Make sure enough workgroups get assigned for split k to work
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
    ggml_vk_sync_buffers(subctx);
@ -5742,7 +5865,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
    const uint64_t ne00 = src0->ne[0];
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
-    // const uint64_t ne03 = src0->ne[3];
+    const uint64_t ne03 = src0->ne[3];
    const uint64_t nb01 = src0->nb[1];
    const uint64_t nb02 = src0->nb[2];
@ -5754,7 +5877,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
    const uint64_t ne12 = src1->ne[2];
    // const uint64_t ne13 = src1->ne[3];
    const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
    const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
    const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
    GGML_ASSERT(ne11 == 1);
    GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@ -5770,7 +5898,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
        src1_uma = d_Qy != nullptr;
    }
-    const uint64_t d_ne = ne01 * ne11 * ne12;
+    const uint64_t d_ne = ne01 * ne11 * ne12 * ne03;
    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
    const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@ -5805,10 +5933,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
    // compute
-    const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+    const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
-        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
 }
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@ -6641,6 +6769,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }
 }
 static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
    const ggml_tensor *src0 = dst->src[0];
    const ggml_tensor *src1 = dst->src[1];
    // src0 - kernel:   [KW, KH, Cin, Cout]
    // src1 - input:    [W, H, Cin, N]
    // dst - result:    [OW, OH, Cout, N]
    // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    };
    // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
    int64_t W    = src1->ne[0];
    int64_t H    = src1->ne[1];
    int64_t KW   = src0->ne[0];
    int64_t KH   = src0->ne[1];
    int64_t Cout = src0->ne[3];
    int64_t N    = src1->ne[3];
    int64_t OH   = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
    int64_t OW   = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
    int64_t NPQ  = N * OW * OH;
    // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
    std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
    return elements;
 }
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
    switch (op) {
    case GGML_OP_GET_ROWS:
@ -6970,10 +7126,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
    case GGML_OP_CONV_2D:
        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
            ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
            auto elements = ggml_vk_get_conv_elements(dst);
            vk_conv_shapes shape;
            uint32_t tiles[CONV_SHAPE_COUNT];
            for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
                tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
            }
            // We can't query number of shader cores on Intel, use 32 as a placeholder
            // so small convolutions will still choose a smaller tile.
            const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
            if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
                shape = CONV_SHAPE_128x128;
            } else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
                shape = CONV_SHAPE_32x256;
            } else {
                shape = CONV_SHAPE_64x32;
            }
            if (src0->type == GGML_TYPE_F32) {
-                return ctx->device->pipeline_conv2d_f32;
+                return ctx->device->pipeline_conv2d_f32[shape];
            } else if (src0->type == GGML_TYPE_F16) {
-                return ctx->device->pipeline_conv2d_f16_f32;
+                return ctx->device->pipeline_conv2d_f16_f32[shape];
            }
        }
        return nullptr;
@ -7301,29 +7477,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        } break;
    case GGML_OP_CONV_2D:
        {
-            // src0 - kernel:   [KW, KH, Cin, Cout]
+            elements = ggml_vk_get_conv_elements(dst);
-            // src1 - input:    [W, H, Cin, N]
+        } break;
            // dst - result:    [OW, OH, Cout, N]
            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
            };
            // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
            int64_t W    = src1->ne[0];
            int64_t H    = src1->ne[1];
            int64_t KW   = src0->ne[0];
            int64_t KH   = src0->ne[1];
            int64_t Cout = src0->ne[3];
            int64_t N    = src1->ne[3];
            int64_t OH   = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
            int64_t OW   = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
            int64_t NPQ  = N * OW * OH;
            // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
        }
        break;
    case GGML_OP_ADD:
    case GGML_OP_SUB:
    case GGML_OP_DIV:
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@ -1,14 +1,18 @@
 #version 450
 #extension GL_EXT_control_flow_attributes : enable
 #ifdef COOPMAT2
 #extension GL_NV_cooperative_matrix2 : enable
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #extension GL_KHR_memory_scope_semantics : enable
 #endif
 #ifdef USE_COLLECTIVES
 #    extension GL_KHR_shader_subgroup_shuffle : enable
 #endif
 #include "types.comp"
 // Make spec constant
 #define SHMEM_PAD 0
 // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
 layout(binding = 0) readonly buffer A {
    A_TYPE knl_data[];
@ -56,6 +60,12 @@ layout(push_constant) uniform parameter {
    uint32_t nb1;
    uint32_t nb2;
    uint32_t nb3;
    // fastdiv helper values
    uint32_t KWmp;   uint32_t KWL;
    uint32_t KWKHmp; uint32_t KWKHL;
    uint32_t OWmp;   uint32_t OWL;
    uint32_t OWOHmp; uint32_t OWOHL;
 }
 p;
@ -68,6 +78,7 @@ layout(constant_id = 3) const uint BS_NPQ          = 128;
 // Thread-tile sizes
 layout(constant_id = 4) const uint TS_K            = 8;
 layout(constant_id = 5) const uint use_collectives = 1;
 layout(constant_id = 6) const uint SHMEM_PAD       = 4;
 uint32_t       tid     = gl_LocalInvocationID.x;
 const uint32_t WG_SIZE = gl_WorkGroupSize.x;
@ -85,6 +96,12 @@ uint32_t n_elems_out = K * NPQ;
 // Number of blocktiles per input
 uint32_t NB_CRS = splitWork(CRS, BS_CRS);
 #ifdef COOPMAT2
 #define SHMEM_TYPE float16_t
 #else
 #define SHMEM_TYPE float
 #endif
 const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
 const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
@ -94,8 +111,8 @@ const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
 const uint32_t Ash_len = BS_K * Ash_stride;
 const uint32_t Bsh_len = BS_CRS * Bsh_stride;
-shared float Ash[Ash_len];  // K x CRS
+shared SHMEM_TYPE Ash[Ash_len];  // K x CRS
-shared float Bsh[Bsh_len];  // CRS x NPQ
+shared SHMEM_TYPE Bsh[Bsh_len];  // CRS x NPQ
 // Threadtile sizes
 const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
@ -104,10 +121,6 @@ const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
 const uint32_t NT_K   = BS_K / TS_K;
 const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
 float regA[TS_K];
 float regB[TS_NPQ];
 float regC[TS_K][TS_NPQ];
 /*
 Compute
 KxCRS @ CRSxNPQ = K x NPQ
@ -131,12 +144,44 @@ uint32_t       Br    = tid / BS_NPQ;
 uint32_t       Bc    = tid % BS_NPQ;
 const uint32_t BrpWg = WG_SIZE / BS_NPQ;
 // Computes n / d for a fixed divisor d using a multiply-high and a shift.
 // mp and L are the multiplier/shift pair precomputed for that divisor;
 // see init_fastdiv_values in ggml-vulkan.cpp.
 // NOTE(review): hi + n is evaluated in 32-bit uint arithmetic, so n presumably
 // has to be small enough for the sum not to wrap - confirm against the host helper.
 uint fastdiv(uint n, uint mp, uint L) {
    uint hi;
    uint lo;
    // hi:lo is the full 64-bit product n * mp; only the high word is used.
    umulExtended(n, mp, hi, lo);
    return (n + hi) >> L;
 }
 #ifdef COOPMAT2
 #define ACC_TYPE float16_t
 // Per-element callback handed to coopMatPerElementNV: stores accumulator element
 // (r, c) of this workgroup's BS_K x BS_NPQ tile into dst_data and returns the
 // element unchanged.
 ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
 {
    // Global coordinates of the element in the K x NPQ output matrix.
    uint32_t K_idx   = B_idx_K * BS_K + r;
    uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
    // Elements that fall outside the K x NPQ output are not stored.
    if (K_idx >= K || NPQ_idx >= NPQ) {
        return elem;
    }
    // Decompose NPQ_idx into (N, OH, OW) coordinates.
    uint32_t N_idx         = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // NPQ_idx / (p.OH * p.OW)
    uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
    uint32_t OH_idx        = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // NPQ_remainder / p.OW
    uint32_t OW_idx        = NPQ_remainder - OH_idx * p.OW;
    dst_data[OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3] = D_TYPE(elem);
    return elem;
 }
 #endif
 void main() {
 #ifdef COOPMAT2
    coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
    matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
 #else
    float regC[TS_K][TS_NPQ];
    for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
        for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
            regC[T_ly][T_lx] = 0.0;
        }
    }
 #endif
    /* Advance block in CRS dim */
    for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
        uint32_t CRS_idx_a;
@ -151,9 +196,9 @@ void main() {
        uint32_t cached_KW_idx;
        if (use_collectives == 1) {
            cached_CRS_idx                = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
-            cached_Cin_idx                = cached_CRS_idx / (p.KW * p.KH);
+            cached_Cin_idx                = fastdiv(cached_CRS_idx, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
            uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
-            cached_KH_idx                 = cached_CRS_remainder / p.KW;
+            cached_KH_idx                 = fastdiv(cached_CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
            cached_KW_idx                 = cached_CRS_remainder - cached_KH_idx * p.KW;
            CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
@ -162,16 +207,16 @@ void main() {
            KW_idx_a  = subgroupShuffle(cached_KW_idx, Ac);
        } else {
            CRS_idx_a              = B_idx_CRS * BS_CRS + Ac;  // Global CRS_idx_a (column index of A)
-            Cin_idx_a              = CRS_idx_a / (p.KW * p.KH);
+            Cin_idx_a              = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
            uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
-            KH_idx_a               = CRS_remainder / p.KW;
+            KH_idx_a               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
            KW_idx_a               = CRS_remainder - KH_idx_a * p.KW;
        }
 #else
        CRS_idx_a     = B_idx_CRS * BS_CRS + Ac;  // Global CRS_idx_a (column index of A)
-        Cin_idx_a     = CRS_idx_a / (p.KW * p.KH);
+        Cin_idx_a     = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
        CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
-        KH_idx_a      = CRS_remainder / p.KW;
+        KH_idx_a      = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
        KW_idx_a      = CRS_remainder - KH_idx_a * p.KW;
 #endif
@ -185,16 +230,16 @@ void main() {
            if (K_idx >= K || CRS_idx_a >= CRS) {
                val = 0.0;
            }
-            Ash[B_ly * Ash_stride + B_lx] = val;
+            Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
        }
        /* Load input to B_block: (BS_CRS x BS_NPQ) */
-        for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
+        UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
            uint32_t B_ly          = r_offset + Br;             /* Row index of B block */
            uint32_t B_lx          = Bc;
            uint32_t NPQ_idx       = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
-            uint32_t N_idx         = NPQ_idx / (p.OH * p.OW);
+            uint32_t N_idx         = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
            uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
-            uint32_t OH_idx        = NPQ_remainder / p.OW;
+            uint32_t OH_idx        = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // divide by p.OW;
            uint32_t OW_idx        = NPQ_remainder - OH_idx * p.OW;
            uint32_t CRS_idx_b;
@ -209,16 +254,16 @@ void main() {
                KW_idx_b  = subgroupShuffle(cached_KW_idx, r_offset + Br);
            } else {
                CRS_idx_b              = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
-                Cin_idx_b              = CRS_idx_b / (p.KW * p.KH);
+                Cin_idx_b              = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
                uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
-                KH_idx_b               = CRS_remainder / p.KW;
+                KH_idx_b               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
                KW_idx_b               = CRS_remainder - KH_idx_b * p.KW;
            }
 #else
            CRS_idx_b              = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
-            Cin_idx_b              = CRS_idx_b / (p.KW * p.KH);
+            Cin_idx_b              = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
            uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
-            KH_idx_b               = CRS_remainder / p.KW;
+            KH_idx_b               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
            KW_idx_b               = CRS_remainder - KH_idx_b * p.KW;
 #endif
@ -230,36 +275,55 @@ void main() {
            if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
                val = 0.0;
            }
-            Bsh[B_ly * Bsh_stride + B_lx] = val;
+            Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
        }
        barrier();
-        for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
+#ifdef COOPMAT2
-            for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+        coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
-                regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
+        coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
-            }
+
-            for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
+        coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
-                regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
+        coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
-            }
+        matC = coopMatMulAdd(matA, matB, matC);
-            for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+#else
        if (T_y * TS_K < K) {
            UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
                float regA[TS_K];
                float regB[TS_NPQ];
                for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
                    regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
                }
                for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
-                    regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
+                    regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
                }
                for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
                    for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
                        regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
                    }
                }
            }
        }
 #endif
        barrier();
    }
    /* Save C* */
-    for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+#ifdef COOPMAT2
-        for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
+    coopMatPerElementNV(matC, matC, perElemOpStore);
-            uint32_t K_idx   = B_idx_K * BS_K + T_y * TS_K + T_ly;
+#else
-            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
+    if (T_y * TS_K < K) {
-            uint32_t N_idx   = NPQ_idx / (p.OH * p.OW);
+        for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
-            uint32_t OH_idx  = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
+            for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
-            uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
+                uint32_t K_idx   = B_idx_K * BS_K + T_y * TS_K + T_ly;
-            uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
+                uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
-            if (K_idx < K && NPQ_idx < NPQ) {
+                uint32_t N_idx   = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
-                dst_data[dst_idx] = regC[T_ly][T_lx];
+                uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
                uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
                uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
                if (K_idx < K && NPQ_idx < NPQ) {
                    dst_data[dst_idx] = regC[T_ly][T_lx];
                }
            }
        }
    }
 #endif
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
@ -26,6 +26,9 @@ layout (push_constant) uniform parameter
    uint ne12;
    uint b_offset;
    uint d_offset;
    uint nb03;
    uint nb13;
    uint nb23;
 } p;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];
@ -34,6 +37,7 @@ void main() {
    const uint tid       = gl_LocalInvocationID.x;
    const uint row_x     = gl_GlobalInvocationID.y;
    const uint channel   = gl_GlobalInvocationID.z;
    const uint i3        = gl_WorkGroupID.x;
    const uint channel_x = channel / p.channel_x_divisor;
    const uint channel_y = channel % p.ne12;
@ -41,7 +45,7 @@ void main() {
    const uint nrows_dst = p.nrows_x;
    const uint row_dst   = row_x;
-    const uint idst = channel*nrows_dst + row_dst;
+    const uint idst = i3*p.nb23 + channel*nrows_dst + row_dst;
    FLOAT_TYPE temp = 0.0f;
@ -58,8 +62,8 @@ void main() {
                const uint row_y = col_x;
-                const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+                const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-                const uint iy = channel_y*p.channel_stride_y + row_y;
+                const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
                const vec4 av4 = vec4(data_a_v4[ix / 4]);
                const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@ -74,8 +78,8 @@ void main() {
            const uint row_y = col_x;
-            const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+            const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-            const uint iy = channel_y*p.channel_stride_y + row_y;
+            const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
            const vec4 av4 = vec4(data_a_v4[ix / 4]);
            const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@ -91,8 +95,8 @@ void main() {
            const uint row_y = col_x;
-            const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+            const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-            const uint iy = channel_y*p.channel_stride_y + row_y;
+            const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
            const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@ -655,8 +655,16 @@ void process_shaders() {
    string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
-    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
+    string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
-    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
+    string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
 #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
 #endif
    string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
    string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -376,6 +376,7 @@ class MODEL_ARCH(IntEnum):
    ERNIE4_5         = auto()
    ERNIE4_5_MOE     = auto()
    HUNYUAN_MOE      = auto()
    HUNYUAN_DENSE    = auto()
    SMOLLM3          = auto()
    LFM2             = auto()
    DREAM            = auto()
@ -697,6 +698,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ERNIE4_5_MOE:     "ernie4_5-moe",
    MODEL_ARCH.FALCON_H1:        "falcon-h1",
    MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
    MODEL_ARCH.HUNYUAN_DENSE:    "hunyuan-dense",
    MODEL_ARCH.SMOLLM3:          "smollm3",
    MODEL_ARCH.LFM2:             "lfm2",
    MODEL_ARCH.DREAM:            "dream",
@ -2471,6 +2473,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
    ],
    MODEL_ARCH.HUNYUAN_DENSE: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_Q_NORM,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.SMOLLM3: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -33,6 +33,7 @@ class TensorNameMap:
            "language_model.model.embed_tokens",         # llama4
            "encoder",                                   # neobert
            "model.transformer.wte",                     # llada
            "embed_tokens",                              # qwen3-embedding
        ),
        # Token type embeddings
@ -143,6 +144,7 @@ class TensorNameMap:
            "transformer_encoder.{bid}.attention_norm",             # neobert
            "model.layers.{bid}.operator_norm",                     # lfm2
            "model.transformer.blocks.{bid}.attn_norm",             # llada
            "layers.{bid}.input_layernorm",                         # qwen3-embedding
        ),
        # Attention norm 2
@ -188,6 +190,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.attention.q_proj",                 # exaone
            "model.layers.{bid}.self_attn.q_proj",                       # llama4
            "model.transformer.blocks.{bid}.q_proj",                     # llada
            "layers.{bid}.self_attn.q_proj",                             # qwen3-embedding
        ),
        # Attention key
@ -205,6 +208,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.attention.k_proj",               # exaone
            "model.layers.{bid}.self_attn.k_proj",                     # llama4
            "model.transformer.blocks.{bid}.k_proj",                   # llada
            "layers.{bid}.self_attn.k_proj",                           # qwen3-embedding
        ),
        # Attention value
@ -221,6 +225,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.attention.v_proj",                 # exaone
            "model.layers.{bid}.self_attn.v_proj",                       # llama4
            "model.transformer.blocks.{bid}.v_proj",                     # llada
            "layers.{bid}.self_attn.v_proj",                             # qwen3-embedding
        ),
        # Attention output
@ -254,6 +259,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.o_proj",                          # llama4
            "transformer_encoder.{bid}.wo",                                 # neobert
            "model.transformer.blocks.{bid}.attn_out",                      # llada
            "layers.{bid}.self_attn.o_proj",                                # qwen3-embedding
        ),
        # Attention output norm
@ -300,6 +306,7 @@ class TensorNameMap:
            "transformer_encoder.{bid}.ffn_norm",                            # neobert
            "model.layers.layers.{bid}.pre_mlp_norm",                        # plamo2
            "model.transformer.blocks.{bid}.ff_norm",                        # llada
            "layers.{bid}.post_attention_layernorm",                         # qwen3-embedding
        ),
        # Post feed-forward norm
@ -373,7 +380,8 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.up_proj",                # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w12",                      # neobert
            "model.layers.{bid}.block_sparse_moe.up",                 # smallthinker
-            "model.transformer.blocks.{bid}.up_proj",                  # llada
+            "model.transformer.blocks.{bid}.up_proj",                 # llada
            "layers.{bid}.mlp.up_proj",                               # qwen3-embedding
        ),
        MODEL_TENSOR.FFN_UP_EXP: (
@ -416,6 +424,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.gate_proj",  # llama4 jamba granite-hybrid
            "model.layers.{bid}.block_sparse_moe.gate",   # smallthinker
            "model.transformer.blocks.{bid}.ff_proj",     # llada
            "layers.{bid}.mlp.gate_proj",                 # qwen3-embedding
        ),
        MODEL_TENSOR.FFN_GATE_EXP: (
@ -465,7 +474,8 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.down_proj",              # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w3",                       # neobert
            "model.layers.{bid}.block_sparse_moe.down",               # smallthinker
-            "model.transformer.blocks.{bid}.ff_out",                   # llada
+            "model.transformer.blocks.{bid}.ff_out",                  # llada
            "layers.{bid}.mlp.down_proj",                             # qwen3-embedding
        ),
        MODEL_TENSOR.FFN_DOWN_EXP: (
@ -497,6 +507,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
            "transformer.layers.{bid}.attn.q_norm",                           # openelm
            "model.layers.layers.{bid}.mixer.q",                              # plamo2
            "layers.{bid}.self_attn.q_norm",                                  # qwen3-embedding
        ),
        MODEL_TENSOR.ATTN_K_NORM: (
@ -508,6 +519,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
            "transformer.layers.{bid}.attn.k_norm",                           # openelm
            "model.layers.layers.{bid}.mixer.k",                              # plamo2
            "layers.{bid}.self_attn.k_norm",                                  # qwen3-embedding
        ),
        MODEL_TENSOR.ROPE_FREQS: (
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@ -312,7 +312,11 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            token_id = config.get(f'{typ}_token_id')
            # If not found at root, check in text_config (for multimodal models like Kimi-VL)
            if token_id is None and 'text_config' in config:
                token_id = config['text_config'].get(f'{typ}_token_id')
            self._set_special_token(typ, token_id)
        return True
--- a/requirements/requirements-convert_hf_to_gguf_update.txt
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
@ -1,7 +1 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch~=2.2.1; platform_machine != "s390x"
 # torch s390x packages can only be found from nightly builds
 --extra-index-url https://download.pytorch.org/whl/nightly
 torch>=0.0.0.dev0; platform_machine == "s390x"
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@ -1,19 +1,41 @@
 #!/usr/bin/env bash
 if [ $# -lt 2 ]; then
-    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
+    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [tool] [additional arguments]"
    echo "  tool: 'llama-bench' (default) or 'test-backend-ops'"
    echo "  additional arguments: passed to the selected tool"
    exit 1
 fi
 set -e
 set -x
 # Parse arguments
 commit1=$1
 commit2=$2
 tool=${3:-llama-bench}
 additional_args="${@:4}"
 # Validate tool argument
 if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then
    echo "Error: tool must be 'llama-bench' or 'test-backend-ops'"
    exit 1
 fi
 # verify at the start that the compare script has all the necessary dependencies installed
 ./scripts/compare-llama-bench.py --check
-bench_args="${@:3}"
+if [ "$tool" = "llama-bench" ]; then
    db_file="llama-bench.sqlite"
    target="llama-bench"
    run_args="-o sql -oe md $additional_args"
 else  # test-backend-ops
    db_file="test-backend-ops.sqlite"
    target="test-backend-ops"
    run_args="perf --output sql $additional_args"
 fi
-rm -f llama-bench.sqlite > /dev/null
+rm -f "$db_file" > /dev/null
 # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
 if [ -n "$GGML_CUDA" ]; then
@ -25,14 +47,14 @@ dir="build-bench"
 function run {
    rm -fr ${dir} > /dev/null
    cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
-    cmake --build ${dir} -t llama-bench > /dev/null
+    cmake --build ${dir} -t $target -j $(nproc) > /dev/null
-    ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+    ${dir}/bin/$target $run_args | sqlite3 "$db_file"
 }
-git checkout $1 > /dev/null
+git checkout $commit1 > /dev/null
 run
-git checkout $2 > /dev/null
+git checkout $commit2 > /dev/null
 run
-./scripts/compare-llama-bench.py -b $1 -c $2
+./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file"
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@ -1,16 +1,16 @@
 #!/usr/bin/env python3
 import logging
 import argparse
 import heapq
 import sys
 import os
 from glob import glob
 import sqlite3
 import json
 import csv
-from typing import Optional, Union
+import heapq
 import json
 import logging
 import os
 import sqlite3
 import sys
 from collections.abc import Iterator, Sequence
 from glob import glob
 from typing import Any, Optional, Union
 try:
    import git
@ -23,7 +23,7 @@ except ImportError as e:
 logger = logging.getLogger("compare-llama-bench")
 # All llama-bench SQL fields
-DB_FIELDS = [
+LLAMA_BENCH_DB_FIELDS = [
    "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
    "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
    "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
@ -33,7 +33,7 @@ DB_FIELDS = [
    "test_time",    "avg_ns",       "stddev_ns",      "avg_ts",     "stddev_ts",
 ]
-DB_TYPES = [
+LLAMA_BENCH_DB_TYPES = [
    "TEXT",    "INTEGER", "TEXT",    "TEXT",    "TEXT",    "TEXT",
    "TEXT",    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
    "TEXT",    "INTEGER", "INTEGER", "TEXT",    "TEXT",    "INTEGER",
@ -42,20 +42,41 @@ DB_TYPES = [
    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
    "TEXT",    "INTEGER", "INTEGER", "REAL",    "REAL",
 ]
 assert len(DB_FIELDS) == len(DB_TYPES)
-# Properties by which to differentiate results per commit:
+# All test-backend-ops SQL fields
-KEY_PROPERTIES = [
+TEST_BACKEND_OPS_DB_FIELDS = [
    "test_time", "build_commit", "backend_name",  "op_name", "op_params", "test_mode",
    "supported", "passed",       "error_message", "time_us", "flops",     "bandwidth_gb_s",
    "memory_kb", "n_runs"
 ]
 TEST_BACKEND_OPS_DB_TYPES = [
    "TEXT",    "TEXT",    "TEXT", "TEXT", "TEXT", "TEXT",
    "INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL",
    "INTEGER", "INTEGER"
 ]
 assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES)
 assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
 # Properties by which to differentiate results per commit for llama-bench:
 LLAMA_BENCH_KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
    "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
    "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
 ]
-# Properties that are boolean and are converted to Yes/No for the table:
+# Properties by which to differentiate results per commit for test-backend-ops:
-BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
+TEST_BACKEND_OPS_KEY_PROPERTIES = [
    "backend_name", "op_name", "op_params", "test_mode"
 ]
-# Header names for the table:
+# Properties that are boolean and are converted to Yes/No for the table:
-PRETTY_NAMES = {
+LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"]
 # Header names for the table (llama-bench):
 LLAMA_BENCH_PRETTY_NAMES = {
    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
    "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
@ -64,21 +85,42 @@ PRETTY_NAMES = {
    "flash_attn": "FlashAttention",
 }
-DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
+# Header names for the table (test-backend-ops):
-DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
+TEST_BACKEND_OPS_PRETTY_NAMES = {
    "backend_name": "Backend", "op_name": "GGML op", "op_params": "Op parameters", "test_mode": "Mode",
    "supported": "Supported", "passed": "Passed", "error_message": "Error",
    "flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs"
 }
 DEFAULT_SHOW_LLAMA_BENCH = ["model_type"]  # Always show these properties by default.
 DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"]  # Always hide these properties by default.
 DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"]  # Always show these properties by default.
 DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"]  # Always hide these properties by default.
 GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
 MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
-DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
+DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
 For llama-bench:
 $ git checkout master
-$ make clean && make llama-bench
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
 $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
 $ git checkout some_branch
-$ make clean && make llama-bench
+$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
 $ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
 $ ./scripts/compare-llama-bench.py
 For test-backend-ops:
 $ git checkout master
 $ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
 $ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
 $ git checkout some_branch
 $ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
 $ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
 $ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite
 Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
 """
@ -96,6 +138,13 @@ help_c = (
    "Defaults to the non-master commit for which llama-bench was run most recently."
 )
 parser.add_argument("-c", "--compare", help=help_c)
 help_t = (
    "The tool whose data is being compared. "
    "Either 'llama-bench' or 'test-backend-ops'. "
    "This determines the database schema and comparison logic used. "
    "If left unspecified, try to determine from the input file."
 )
 parser.add_argument("-t", "--tool", help=help_t, default=None, choices=[None, "llama-bench", "test-backend-ops"])
 help_i = (
    "JSON/JSONL/SQLite/CSV files for comparing commits. "
    "Specify multiple times to use multiple input files (JSON/CSV only). "
@ -114,7 +163,8 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
 help_s = (
    "Columns to add to the table. "
    "Accepts a comma-separated list of values. "
-    f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
+    f"Legal values for test-backend-ops: {', '.join(TEST_BACKEND_OPS_KEY_PROPERTIES)}. "
    f"Legal values for llama-bench: {', '.join(LLAMA_BENCH_KEY_PROPERTIES[:-3])}. "
    "Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
    "plus any column where not all data points are the same. "
    "If the columns are manually specified, then the results for each unique combination of the "
@ -142,8 +192,14 @@ if unknown_args:
    sys.exit(1)
 input_file = known_args.input
-if not input_file and os.path.exists("./llama-bench.sqlite"):
+tool = known_args.tool
-    input_file = ["llama-bench.sqlite"]
+
 if not input_file:
    if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"):
        input_file = ["llama-bench.sqlite"]
    elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"):
        input_file = ["test-backend-ops.sqlite"]
 if not input_file:
    sqlite_files = glob("*.sqlite")
    if len(sqlite_files) == 1:
@ -161,14 +217,23 @@ class LlamaBenchData:
    build_len_max: int
    build_len: int = 8
    builds: list[str] = []
-    check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
+    tool: str = "llama-bench"  # Tool type: "llama-bench" or "test-backend-ops"
-    def __init__(self):
+    def __init__(self, tool: str = "llama-bench"):
        self.tool = tool
        try:
            self.repo = git.Repo(".", search_parent_directories=True)
        except git.InvalidGitRepositoryError:
            self.repo = None
        # Set schema-specific properties based on tool
        if self.tool == "llama-bench":
            self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
        elif self.tool == "test-backend-ops":
            self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"])
        else:
            assert False
    def _builds_init(self):
        self.build_len = self.build_len_min
@ -252,52 +317,121 @@ class LlamaBenchData:
 class LlamaBenchDataSQLite3(LlamaBenchData):
    connection: sqlite3.Connection
    cursor: sqlite3.Cursor
    table_name: str
-    def __init__(self):
+    def __init__(self, tool: str = "llama-bench"):
-        super().__init__()
+        super().__init__(tool)
        self.connection = sqlite3.connect(":memory:")
        self.cursor = self.connection.cursor()
-        self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")
+
        # Set table name and schema based on tool
        if self.tool == "llama-bench":
            self.table_name = "llama_bench"
            db_fields = LLAMA_BENCH_DB_FIELDS
            db_types = LLAMA_BENCH_DB_TYPES
        elif self.tool == "test-backend-ops":
            self.table_name = "test_backend_ops"
            db_fields = TEST_BACKEND_OPS_DB_FIELDS
            db_types = TEST_BACKEND_OPS_DB_TYPES
        else:
            assert False
        self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});")
    def _builds_init(self):
        if self.connection:
-            self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
+            self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
-            self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
+            self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
            if self.build_len_min != self.build_len_max:
                logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
                               "Try purging the the database of old commits.")
-                self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
+                self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
-            builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
+            builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall()
            self.builds = list(map(lambda b: b[0], builds))  # list[tuple[str]] -> list[str]
        super()._builds_init()
    def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
        data = self.cursor.execute(
-            "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
+            f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall()
        return reversed(data) if reverse else data
    def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        """Return the comparison rows for two commits, using the query that matches ``self.tool``.
        Args:
            properties: Column names to select and group the results by.
            hexsha8_baseline: Commit hash of the baseline build.
            hexsha8_compare: Commit hash of the build compared against the baseline.
        Returns:
            Whatever the tool-specific row getter returns for the same arguments.
        """
        # Guard clauses: each supported tool forwards to its dedicated query builder.
        if self.tool == "llama-bench":
            return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare)
        if self.tool == "test-backend-ops":
            return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare)
        # Any other tool value is a programming error.
        assert False
    def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        select_string = ", ".join(
            [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
        equal_string = " AND ".join(
-            [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
+            [f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [
                f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
        )
        group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
-        query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
+        query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
                 f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
        return self.cursor.execute(query).fetchall()
    def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        """Query averaged FLOPS and bandwidth for a baseline commit and a compare commit.
        The table is self-joined (aliases ``tb`` = baseline, ``tc`` = compare) on every
        test-backend-ops key property. Each result row holds the requested ``properties``
        followed by AVG(tb.flops), AVG(tc.flops), AVG(tb.bandwidth_gb_s), AVG(tc.bandwidth_gb_s).
        Args:
            properties: Column names to select, group by and order by.
            hexsha8_baseline: Commit hash of the baseline build.
            hexsha8_compare: Commit hash of the build compared against the baseline.
        Returns:
            All rows produced by the query.
        """
        grouped_columns = [f"tb.{name}" for name in properties]
        metric_columns = [
            "AVG(tb.flops)", "AVG(tc.flops)",
            "AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)",
        ]
        # Rows are paired when all key properties match and each side belongs to its commit.
        join_conditions = [f"tb.{name} = tc.{name}" for name in TEST_BACKEND_OPS_KEY_PROPERTIES]
        join_conditions.append(f"tb.build_commit = '{hexsha8_baseline}'")
        join_conditions.append(f"tc.build_commit = '{hexsha8_compare}'")
        # Only compare successful tests
        join_conditions.extend(["tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"])
        select_clause = ", ".join(grouped_columns + metric_columns)
        on_clause = " AND ".join(join_conditions)
        grouping = ", ".join(grouped_columns)
        table = self.table_name
        sql = (f"SELECT {select_clause} FROM {table} tb JOIN {table} tc ON {on_clause} "
               f"GROUP BY {grouping} ORDER BY {grouping};")
        return self.cursor.execute(sql).fetchall()
 class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
-    def __init__(self, data_file: str):
+    def __init__(self, data_file: str, tool: Any):
-        super().__init__()
+        super().__init__(tool)
        self.connection.close()
        self.connection = sqlite3.connect(data_file)
        self.cursor = self.connection.cursor()
        # Check which table exists in the database
        tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        table_names = [table[0] for table in tables]
        # Tool selection logic
        if tool is None:
            if "llama_bench" in table_names:
                self.table_name = "llama_bench"
                self.tool = "llama-bench"
            elif "test_backend_ops" in table_names:
                self.table_name = "test_backend_ops"
                self.tool = "test-backend-ops"
            else:
                raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
        elif tool == "llama-bench":
            if "llama_bench" in table_names:
                self.table_name = "llama_bench"
                self.tool = "llama-bench"
            else:
                raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
        elif tool == "test-backend-ops":
            if "test_backend_ops" in table_names:
                self.table_name = "test_backend_ops"
                self.tool = "test-backend-ops"
            else:
                raise RuntimeError(f"Table 'test_backend_ops' not found for tool 'test-backend-ops'. Available tables: {table_names}")
        else:
            raise RuntimeError(f"Unknown tool: {tool}")
        self._builds_init()
    @staticmethod
@ -317,20 +451,23 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
 class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
-    def __init__(self, data_file: str):
+    def __init__(self, data_file: str, tool: str = "llama-bench"):
-        super().__init__()
+        super().__init__(tool)
        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
        with open(data_file, "r", encoding="utf-8") as fp:
            for i, line in enumerate(fp):
                parsed = json.loads(line)
-                for k in parsed.keys() - set(DB_FIELDS):
+                for k in parsed.keys() - set(db_fields):
                    del parsed[k]
                if (missing_keys := self._check_keys(parsed.keys())):
                    raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
-                self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+                self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
        self._builds_init()
@ -349,21 +486,24 @@ class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
 class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
-    def __init__(self, data_files: list[str]):
+    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
-        super().__init__()
+        super().__init__(tool)
        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
        for data_file in data_files:
            with open(data_file, "r", encoding="utf-8") as fp:
                parsed = json.load(fp)
                for i, entry in enumerate(parsed):
-                    for k in entry.keys() - set(DB_FIELDS):
+                    for k in entry.keys() - set(db_fields):
                        del entry[k]
                    if (missing_keys := self._check_keys(entry.keys())):
                        raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
-                    self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
+                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
        self._builds_init()
@ -384,21 +524,24 @@ class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
 class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
-    def __init__(self, data_files: list[str]):
+    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
-        super().__init__()
+        super().__init__(tool)
        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
        for data_file in data_files:
            with open(data_file, "r", encoding="utf-8") as fp:
                for i, parsed in enumerate(csv.DictReader(fp)):
                    keys = set(parsed.keys())
-                    for k in keys - set(DB_FIELDS):
+                    for k in keys - set(db_fields):
                        del parsed[k]
                    if (missing_keys := self._check_keys(keys)):
                        raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
-                    self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
        self._builds_init()
@ -419,21 +562,90 @@ class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
        return True
 def format_flops(flops_value: float) -> str:
    """Format a FLOPS value with a unit suffix (T, G, M, k or none) for better readability.
    The largest unit whose threshold does not exceed the magnitude of the value is used.
    Scaled magnitudes of 100 or more get one decimal place, smaller ones get two.
    Args:
        flops_value: Raw FLOPS value.
    Returns:
        The scaled number followed by the unit suffix, or "0.00" for zero.
    """
    if flops_value == 0:
        return "0.00"
    # (threshold, suffix) pairs, largest first so the first match is the best-fitting unit.
    units = [
        (1e12, "T"),   # TeraFLOPS
        (1e9, "G"),    # GigaFLOPS
        (1e6, "M"),    # MegaFLOPS
        (1e3, "k"),    # kiloFLOPS
        (1, "")        # FLOPS
    ]
    magnitude = abs(flops_value)
    for threshold, unit in units:
        if magnitude >= threshold:
            formatted_value = flops_value / threshold
            # Pick the precision from the magnitude so that negative values are
            # formatted the same way as positive values of equal size.
            if abs(formatted_value) >= 100:
                return f"{formatted_value:.1f}{unit}"
            return f"{formatted_value:.2f}{unit}"
    # Fallback for magnitudes below 1
    return f"{flops_value:.2f}"
 def format_flops_for_table(flops_value: float, target_unit: str) -> str:
    """Format a FLOPS value for table display without a unit suffix (the unit is in the header).
    Args:
        flops_value: Raw FLOPS value.
        target_unit: One of "TFLOPS", "GFLOPS", "MFLOPS", "kFLOPS" or "FLOPS".
            Any other string leaves the value unscaled.
    Returns:
        The value divided by the unit's divisor, with one decimal place when the scaled
        magnitude is 100 or more and two otherwise; "0.00" for zero.
    """
    if flops_value == 0:
        return "0.00"
    # Divisor applied for each supported header unit.
    unit_divisors = {
        "TFLOPS": 1e12,
        "GFLOPS": 1e9,
        "MFLOPS": 1e6,
        "kFLOPS": 1e3,
        "FLOPS": 1
    }
    divisor = unit_divisors.get(target_unit, 1)
    formatted_value = flops_value / divisor
    # Pick the precision from the magnitude so that negative values are
    # formatted the same way as positive values of equal size.
    if abs(formatted_value) >= 100:
        return f"{formatted_value:.1f}"
    return f"{formatted_value:.2f}"
 def get_flops_unit_name(flops_values: list) -> str:
    """Pick the FLOPS unit name that best fits the largest magnitude among the given values.
    Args:
        flops_values: FLOPS values; zeros are ignored when looking for the largest magnitude.
    Returns:
        "TFLOPS", "GFLOPS", "MFLOPS", "kFLOPS" or "FLOPS". An empty or all-zero input gives "FLOPS".
    """
    if not flops_values or all(v == 0 for v in flops_values):
        return "FLOPS"
    # The largest non-zero magnitude decides the unit.
    peak = max(abs(v) for v in flops_values if v != 0)
    # Thresholds in descending order: the first one reached wins.
    for bound, unit_name in ((1e12, "TFLOPS"), (1e9, "GFLOPS"), (1e6, "MFLOPS"), (1e3, "kFLOPS")):
        if peak >= bound:
            return unit_name
    return "FLOPS"
 bench_data = None
 if len(input_file) == 1:
    if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
-        bench_data = LlamaBenchDataSQLite3File(input_file[0])
+        bench_data = LlamaBenchDataSQLite3File(input_file[0], tool)
    elif LlamaBenchDataJSON.valid_format(input_file):
-        bench_data = LlamaBenchDataJSON(input_file)
+        bench_data = LlamaBenchDataJSON(input_file, tool)
    elif LlamaBenchDataJSONL.valid_format(input_file[0]):
-        bench_data = LlamaBenchDataJSONL(input_file[0])
+        bench_data = LlamaBenchDataJSONL(input_file[0], tool)
    elif LlamaBenchDataCSV.valid_format(input_file):
-        bench_data = LlamaBenchDataCSV(input_file)
+        bench_data = LlamaBenchDataCSV(input_file, tool)
 else:
    if LlamaBenchDataJSON.valid_format(input_file):
-        bench_data = LlamaBenchDataJSON(input_file)
+        bench_data = LlamaBenchDataJSON(input_file, tool)
    elif LlamaBenchDataCSV.valid_format(input_file):
-        bench_data = LlamaBenchDataCSV(input_file)
+        bench_data = LlamaBenchDataCSV(input_file, tool)
 if not bench_data:
    raise RuntimeError("No valid (or some invalid) input files found.")
@ -504,12 +716,29 @@ else:
 name_compare = bench_data.get_commit_name(hexsha8_compare)
 # Get tool-specific configuration
 if tool == "llama-bench":
    key_properties = LLAMA_BENCH_KEY_PROPERTIES
    bool_properties = LLAMA_BENCH_BOOL_PROPERTIES
    pretty_names = LLAMA_BENCH_PRETTY_NAMES
    default_show = DEFAULT_SHOW_LLAMA_BENCH
    default_hide = DEFAULT_HIDE_LLAMA_BENCH
 elif tool == "test-backend-ops":
    key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES
    bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES
    pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES
    default_show = DEFAULT_SHOW_TEST_BACKEND_OPS
    default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS
 else:
    assert False
 # If the user provided columns to group the results by, use them:
 if known_args.show is not None:
    show = known_args.show.split(",")
    unknown_cols = []
    for prop in show:
-        if prop not in KEY_PROPERTIES[:-3]:  # Last three values are n_prompt, n_gen, n_depth.
+        valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3]  # Exclude n_prompt, n_gen, n_depth for llama-bench
        if prop not in valid_props:
            unknown_cols.append(prop)
    if unknown_cols:
        logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
@ -518,32 +747,54 @@ if known_args.show is not None:
    rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
 # Otherwise, select those columns where the values are not all the same:
 else:
-    rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
+    rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare)
    properties_different = []
-    for i, kp_i in enumerate(KEY_PROPERTIES):
+
-        if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
+    if tool == "llama-bench":
-            continue
+        # For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic
-        for row_full in rows_full:
+        check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]]
-            if row_full[i] != rows_full[0][i]:
+        for i, kp_i in enumerate(key_properties):
-                properties_different.append(kp_i)
+            if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]:
-                break
+                continue
            for row_full in rows_full:
                if row_full[i] != rows_full[0][i]:
                    properties_different.append(kp_i)
                    break
    elif tool == "test-backend-ops":
        # For test-backend-ops, check all key properties
        for i, kp_i in enumerate(key_properties):
            if kp_i in default_show:
                continue
            for row_full in rows_full:
                if row_full[i] != rows_full[0][i]:
                    properties_different.append(kp_i)
                    break
    else:
        assert False
    show = []
    # Show CPU and/or GPU by default even if the hardware for all results is the same:
    if rows_full and "n_gpu_layers" not in properties_different:
        ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
-        if ngl != 99 and "cpu_info" not in properties_different:
+    if tool == "llama-bench":
-            show.append("cpu_info")
+        # Show CPU and/or GPU by default even if the hardware for all results is the same:
        if rows_full and "n_gpu_layers" not in properties_different:
            ngl = int(rows_full[0][key_properties.index("n_gpu_layers")])
-    show += properties_different
+            if ngl != 99 and "cpu_info" not in properties_different:
                show.append("cpu_info")
-    index_default = 0
+        show += properties_different
-    for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
+
-        if prop in show:
+        index_default = 0
-            index_default += 1
+        for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
-    show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
+            if prop in show:
-    for prop in DEFAULT_HIDE:
+                index_default += 1
        show = show[:index_default] + default_show + show[index_default:]
    elif tool == "test-backend-ops":
        show = default_show + properties_different
    else:
        assert False
    for prop in default_hide:
        try:
            show.remove(prop)
        except ValueError:
@ -551,7 +802,7 @@ else:
    # Add plot_x parameter to parameters to show if it's not already present:
    if known_args.plot:
-        for k, v in PRETTY_NAMES.items():
+        for k, v in pretty_names.items():
            if v == known_args.plot_x and k not in show:
                show.append(k)
                break
@ -563,60 +814,120 @@ if not rows_show:
    sys.exit(1)
 table = []
-for row in rows_show:
+primary_metric = "FLOPS"  # Default to FLOPS for test-backend-ops
-    n_prompt = int(row[-5])
+
-    n_gen    = int(row[-4])
+if tool == "llama-bench":
-    n_depth  = int(row[-3])
+    # For llama-bench, create test names and compare avg_ts values
-    if n_prompt != 0 and n_gen == 0:
+    for row in rows_show:
-        test_name = f"pp{n_prompt}"
+        n_prompt = int(row[-5])
-    elif n_prompt == 0 and n_gen != 0:
+        n_gen    = int(row[-4])
-        test_name = f"tg{n_gen}"
+        n_depth  = int(row[-3])
-    else:
+        if n_prompt != 0 and n_gen == 0:
-        test_name = f"pp{n_prompt}+tg{n_gen}"
+            test_name = f"pp{n_prompt}"
-    if n_depth != 0:
+        elif n_prompt == 0 and n_gen != 0:
-        test_name = f"{test_name}@d{n_depth}"
+            test_name = f"tg{n_gen}"
-    #           Regular columns    test name    avg t/s values              Speedup
+        else:
-    #            VVVVVVVVVVVVV     VVVVVVVVV    VVVVVVVVVVVVVV              VVVVVVV
+            test_name = f"pp{n_prompt}+tg{n_gen}"
-    table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
+        if n_depth != 0:
            test_name = f"{test_name}@d{n_depth}"
        #           Regular columns    test name    avg t/s values              Speedup
        #            VVVVVVVVVVVVV     VVVVVVVVV    VVVVVVVVVVVVVV              VVVVVVV
        table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
 elif tool == "test-backend-ops":
    # Determine the primary metric by checking rows until we find one with valid data
    if rows_show:
        primary_metric = "FLOPS"  # Default to FLOPS
        flops_values = []
        # Collect all FLOPS values to determine the best unit
        for sample_row in rows_show:
            baseline_flops = float(sample_row[-4])
            compare_flops = float(sample_row[-3])
            baseline_bandwidth = float(sample_row[-2])
            if baseline_flops > 0:
                flops_values.extend([baseline_flops, compare_flops])
            elif baseline_bandwidth > 0 and not flops_values:
                primary_metric = "Bandwidth (GB/s)"
        # If we have FLOPS data, determine the appropriate unit
        if flops_values:
            primary_metric = get_flops_unit_name(flops_values)
    # For test-backend-ops, prioritize FLOPS > bandwidth for comparison
    for row in rows_show:
        # Extract metrics: flops, bandwidth_gb_s (baseline and compare)
        baseline_flops = float(row[-4])
        compare_flops = float(row[-3])
        baseline_bandwidth = float(row[-2])
        compare_bandwidth = float(row[-1])
        # Determine which metric to use for comparison (prioritize FLOPS > bandwidth)
        if baseline_flops > 0 and compare_flops > 0:
            # Use FLOPS comparison (higher is better)
            speedup = compare_flops / baseline_flops
            baseline_str = format_flops_for_table(baseline_flops, primary_metric)
            compare_str = format_flops_for_table(compare_flops, primary_metric)
        elif baseline_bandwidth > 0 and compare_bandwidth > 0:
            # Use bandwidth comparison (higher is better)
            speedup = compare_bandwidth / baseline_bandwidth
            baseline_str = f"{baseline_bandwidth:.2f}"
            compare_str = f"{compare_bandwidth:.2f}"
        else:
            # Fallback if no valid data is available
            baseline_str = "N/A"
            compare_str = "N/A"
            from math import nan
            speedup = nan
        table.append(list(row[:-4]) + [baseline_str, compare_str, speedup])
 else:
    assert False
 # Some a-posteriori fixes to make the table contents prettier:
-for bool_property in BOOL_PROPERTIES:
+for bool_property in bool_properties:
    if bool_property in show:
        ip = show.index(bool_property)
        for row_table in table:
            row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
-if "model_type" in show:
+if tool == "llama-bench":
-    ip = show.index("model_type")
+    if "model_type" in show:
-    for (old, new) in MODEL_SUFFIX_REPLACE.items():
+        ip = show.index("model_type")
        for (old, new) in MODEL_SUFFIX_REPLACE.items():
            for row_table in table:
                row_table[ip] = row_table[ip].replace(old, new)
    if "model_size" in show:
        ip = show.index("model_size")
        for row_table in table:
-            row_table[ip] = row_table[ip].replace(old, new)
+            row_table[ip] = float(row_table[ip]) / 1024 ** 3
-if "model_size" in show:
+    if "gpu_info" in show:
-    ip = show.index("model_size")
+        ip = show.index("gpu_info")
-    for row_table in table:
+        for row_table in table:
-        row_table[ip] = float(row_table[ip]) / 1024 ** 3
+            for gns in GPU_NAME_STRIP:
                row_table[ip] = row_table[ip].replace(gns, "")
-if "gpu_info" in show:
+            gpu_names = row_table[ip].split(", ")
-    ip = show.index("gpu_info")
+            num_gpus = len(gpu_names)
-    for row_table in table:
+            all_names_the_same = len(set(gpu_names)) == 1
-        for gns in GPU_NAME_STRIP:
+            if len(gpu_names) >= 2 and all_names_the_same:
-            row_table[ip] = row_table[ip].replace(gns, "")
+                row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
-        gpu_names = row_table[ip].split(", ")
+headers  = [pretty_names.get(p, p) for p in show]
-        num_gpus = len(gpu_names)
+if tool == "llama-bench":
-        all_names_the_same = len(set(gpu_names)) == 1
+    headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
-        if len(gpu_names) >= 2 and all_names_the_same:
+elif tool == "test-backend-ops":
-            row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
+    headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"]
-
+else:
-headers  = [PRETTY_NAMES[p] for p in show]
+    assert False
 headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
 if known_args.plot:
-    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
+    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"):
        try:
            import matplotlib.pyplot as plt
            import matplotlib
            import matplotlib.pyplot as plt
            matplotlib.use('Agg')
        except ImportError as e:
            logger.error("matplotlib is required for --plot.")
@ -627,7 +938,7 @@ if known_args.plot:
        plot_x_label = plot_x_param
        if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
-            pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
+            pretty_name = LLAMA_BENCH_PRETTY_NAMES.get(plot_x_param, plot_x_param)
            if pretty_name in data_headers:
                plot_x_index = data_headers.index(pretty_name)
                plot_x_label = pretty_name
@ -746,8 +1057,16 @@ if known_args.plot:
            title = ', '.join(title_parts) if title_parts else "Performance comparison"
            # Determine y-axis label based on tool type
            if tool_type == "llama-bench":
                y_label = "Tokens per second (t/s)"
            elif tool_type == "test-backend-ops":
                y_label = metric_name
            else:
                assert False
            ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
-            ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
+            ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
            ax.set_title(title, fontsize=12, fontweight='bold')
            ax.legend(loc='best', fontsize=10)
            ax.grid(True, alpha=0.3)
@ -765,7 +1084,7 @@ if known_args.plot:
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
-    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
+    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric)
 print(tabulate( # noqa: NP100
    table,
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
    { LLM_ARCH_ERNIE4_5_MOE,     "ernie4_5-moe"     },
    { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
    { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense"    },
    { LLM_ARCH_SMOLLM3,          "smollm3"          },
    { LLM_ARCH_LFM2,             "lfm2"             },
    { LLM_ARCH_DREAM,            "dream"            },
@ -1897,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
        },
    },
    {
        LLM_ARCH_HUNYUAN_DENSE,
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_SMOLLM3,
        {
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@ -89,6 +89,7 @@ enum llm_arch {
    LLM_ARCH_ERNIE4_5,
    LLM_ARCH_ERNIE4_5_MOE,
    LLM_ARCH_HUNYUAN_MOE,
    LLM_ARCH_HUNYUAN_DENSE,
    LLM_ARCH_SMOLLM3,
    LLM_ARCH_LFM2,
    LLM_ARCH_DREAM,
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
    { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
    { "hunyuan-dense",     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE     },
    { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
 };
@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    } else if (tmpl_contains("<｜hy_place▁holder▁no▁2｜>") && tmpl_contains("<｜hy_place▁holder▁no▁3｜>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;
    }
@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
            if (role == "system") {
                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
            } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
+                ss << message->content << "<|eos|>";
            } else {
                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
        // tencent/Hunyuan-4B-Instruct
        for (size_t i = 0; i < chat.size(); i++) {
            std::string role(chat[i]->role);
            if (i == 0) {
                if (role == "system") {
                    ss << chat[i]->content << "<｜hy_place▁holder▁no▁3｜>";
                }
            }
            if (role == "assistant") {
                ss << "<｜hy_Assistant｜>" << chat[i]->content << "<｜hy_place▁holder▁no▁2｜>";
            } else if (role == "user") {
                ss << "<｜hy_User｜>" << chat[i]->content << "<｜hy_Assistant｜>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
        // moonshotai/Kimi-K2-Instruct
        for (auto message : chat) {
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@ -46,6 +46,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -105,7 +105,7 @@ llama_context::llama_context(
    {
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
        if (!supports_set_rows && !cparams.kv_unified) {
            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@ -113,6 +113,15 @@ llama_context::llama_context(
        }
    }
    {
        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
        if (graph_reuse_disable) {
            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
        }
    }
    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
    // in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
    const auto gparams = graph_params(res, ubatch, mctx, gtype);
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
        //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
        n_reused++;
--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -289,7 +289,10 @@ private:
    // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
    // env: LLAMA_GRAPH_REUSE_DISABLE
    bool graph_reuse_disable = false;
    // perf
    mutable int64_t t_start_us  = 0;
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -423,7 +423,9 @@ struct llm_graph_params {
                (!ubatch.embd  && !other.ubatch.embd)
            );
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
        //   the reason is because the set of attention streams would be different for different sequences
        if (can_reuse_ubatch && ubatch.equal_seqs()) {
            if (!ubatch.data) {
                // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
                //   therefore we cannot perform the sequence id check. normally should never happen
--- a/src/llama-kv-cache-unified.cpp
+++ b/src/llama-kv-cache-unified.cpp
@ -183,7 +183,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
        const size_t memory_size_k = size_k_bytes();
        const size_t memory_size_v = size_v_bytes();
-        LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+        LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
    debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
    if (!supports_set_rows) {
        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
--- a/src/llama-kv-cache-unified.h
+++ b/src/llama-kv-cache-unified.h
@ -230,7 +230,7 @@ private:
    // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
                         /* common */
             uint32_t    n_seq_max,
                 bool    offload,
                 bool    unified,
                         /* layer filters */
      layer_filter_cb && filter_attn,
      layer_filter_cb && filter_recr) :
@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
        type_v,
        v_trans,
        offload,
-        1,
+        unified,
        kv_size,
        n_seq_max,
        n_pad,
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@ -39,6 +39,7 @@ public:
                             /* common */
                 uint32_t    n_seq_max,
                     bool    offload,
                     bool    unified,
                             /* layer filters */
          layer_filter_cb && filter_attn = nullptr,
          layer_filter_cb && filter_recr = nullptr);
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -899,6 +899,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_QWEN3:
            {
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@ -1760,6 +1761,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        // HunYuan dense: read the RMS-norm epsilon, then derive the model-size label
        // from the embedding width (n_embd). Widths not listed map to LLM_TYPE_UNKNOWN.
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_embd) {
                    case 1024: type = LLM_TYPE_0_5B; break;
                    case 2048: type = LLM_TYPE_1_8B; break;
                    case 3072: type = LLM_TYPE_4B; break;
                    case 4096: type = LLM_TYPE_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5195,6 +5208,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                    }
                } break;
            // HunYuan dense: token embedding, output norm/projection and, per layer,
            // attention (Q/K/V/O plus Q and K norm weights) and a gated FFN (gate/down/up).
            case LLM_ARCH_HUNYUAN_DENSE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    // the output projection is optional (TENSOR_NOT_REQUIRED)
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        // attention projections: Q/O use n_embd_head_k * n_head, K/V use the GQA widths
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        // Q/K norm weights are sized by the head dimension (n_embd_head_k)
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                        // feed-forward weights
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                    }
                } break;
            case LLM_ARCH_SMOLLM3:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -16930,6 +16976,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
    }
 };
 // Graph builder for LLM_ARCH_HUNYUAN_DENSE.
 //
 // The constructor assembles the whole forward graph:
 //   token embeddings
 //   -> n_layer x [ RMS norm -> self-attention (RoPE on Q/K, then RMS norm on Q/K) -> residual add
 //                  -> RMS norm -> SiLU-gated parallel FFN -> residual add ]
 //   -> final RMS norm (stored in res->t_embd) -> output projection (stored in res->t_logits)
 struct llm_build_hunyuan_dense : public llm_graph_context {
    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        // this builder requires equal K/V head sizes and a rotary dimension equal to the head size
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);
        ggml_tensor * cur;
        ggml_tensor * inpL;
        inpL = build_inp_embd(model.tok_embd);
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
        auto * inp_attn = build_attn_inp_kv_unified();
        // attention scale: 1 / sqrt(head size)
        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
        // row indices used on the last layer to keep only the selected rows (see below)
        ggml_tensor * inp_out_ids = build_inp_out_ids();
        for (int il = 0; il < n_layer; ++il) {
            // keep the layer input for the residual connection around attention
            ggml_tensor * inpSA = inpL;
            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);
            // self-attention
            {
                // optional RoPE frequency factors for this layer; passed to both ggml_rope_ext calls below
                // NOTE(review): presumably nullptr when the model ships no rope factors - confirm for this architecture
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
                // compute Q and K and RoPE them
                // (the Q/K/V biases are added only if the layer has them)
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }
                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }
                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }
                // split the projections into heads: [n_embd_head, n_head(_kv), n_tokens]
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
                Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
                // NOTE: at this point only Qcur has been rotated; Kcur is tagged here before its RoPE is applied
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);
                Kcur = ggml_rope_ext(
                         ctx0, Kcur, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
                // Q/K RMS norm is applied after RoPE
                Kcur = build_norm(Kcur,
                         model.layers[il].attn_k_norm, nullptr,
                         LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_norm", il);
                Qcur = build_norm(Qcur,
                         model.layers[il].attn_q_norm, nullptr,
                         LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_norm", il);
                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }
            // on the last layer, keep only the rows listed in inp_out_ids (when provided),
            // for both the attention output and the residual input
            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }
            // residual connection around attention
            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);
            // feed-forward network (non-MoE)
            ggml_tensor * cur_mlp = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur_mlp, "ffn_out", il);
            // residual connection around the FFN
            cur = ggml_add(ctx0, cur_mlp, ffn_inp);
            // NOTE(review): presumably applies the control vector for this layer, if any - confirm in build_cvec
            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);
            // input for next layer
            inpL = cur;
        }
        cur = inpL;
        // final norm; its result is exposed as the embedding output
        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;
        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;
        ggml_build_forward_expand(gf, cur);
    }
 };
 struct llm_build_smollm3 : public llm_graph_context {
    llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@ -17414,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                        /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                        /* n_seq_max         */ cparams.n_seq_max,
                        /* offload           */ cparams.offload_kqv,
                        /* unified           */ cparams.kv_unified,
                        /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                        /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
                } else {
@ -17797,6 +17982,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
@ -18016,6 +18205,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
            return LLAMA_ROPE_TYPE_NEOX;
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "gigachat"   ||
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "a.x-4.0" ||
                    tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
                clean_spaces = false;
            } else if (
                tokenizer_pre == "hunyuan-dense") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
                clean_spaces = false;
            } else if (
                tokenizer_pre == "kimi-k2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
 };
 struct LLM_KV;
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -5592,13 +5592,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3}));
-    for (auto bs : {1,2,4,8}) {
+    for (auto bs2 : {1,3}) {
-        for (auto nr : {1,4}) {
+        for (auto bs : {1,2,4,8}) {
-            for (uint32_t m = 0; m < 2; ++m) {
+            for (auto nr : {1,4}) {
-                for (uint32_t k = 0; k < 2; ++k) {
+                for (uint32_t m = 0; m < 2; ++m) {
-                    for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
+                    for (uint32_t k = 0; k < 2; ++k) {
-                        test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  1}, {nr, 1}, {0, 2, 1, 3}));
+                        for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
-                        test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  1}, {nr, 1}, {0, 1, 2, 3}, true));
+                            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  bs2}, {nr, 1}, {0, 2, 1, 3}));
                            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  bs2}, {nr, 1}, {0, 1, 2, 3}, true));
                        }
                    }
                }
            }
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -953,6 +953,33 @@ static void test_template_output_parsers() {
                /* is_partial= */ false,
                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
        // Test multiple tool calls
        common_chat_msg message_assist_multiple_calls;
        message_assist_multiple_calls.role = "assistant";
        message_assist_multiple_calls.content = "";
        message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
        message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""});
        assert_msg_equals(
            message_assist_multiple_calls,
            common_chat_parse(
                "<tool_call>\n"
                "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                "</tool_call>\n"
                "<tool_call>\n"
                "{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n"
                "</tool_call>",
                /* is_partial= */ false,
                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
        assert_msg_equals(
            message_assist_multiple_calls,
            common_chat_parse(
                "<function=special_function>{\"arg1\": 1}</function>\n"
                "<function=python>{\"code\":\"print('hello')\"}</function>",
                /* is_partial= */ false,
                {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
        assert_msg_equals(
            simple_assist_msg(
                "This is not a tool call:",
@ -1039,6 +1066,22 @@ static void test_template_output_parsers() {
                      "<tool_call>\n"
                      "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                      "</tool_call>");
        // Test multiple tool calls with template
        common_chat_msg message_assist_multiple_calls_template;
        message_assist_multiple_calls_template.role = "assistant";
        message_assist_multiple_calls_template.content = "";
        message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
        message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""});
        test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools,
                      "<tool_call>\n"
                      "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                      "</tool_call>\n"
                      "<tool_call>\n"
                      "{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n"
                      "</tool_call>");
        test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
                      "<tool_call>\n"
                      "{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"
--- a/tools/imatrix/README.md
+++ b/tools/imatrix/README.md
@ -7,7 +7,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486
 ```
 ./llama-imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
+    -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
    [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
    [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
    [--show-statistics] [...]
@ -20,6 +20,7 @@ The parameters in square brackets are optional and have the following meaning:
 * `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing, `imatrix.gguf` is used.
 * `-ofreq | --output-frequency` specifies how often the result computed so far is saved to disk. Default is 10 (i.e., every 10 chunks).
 * `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 * `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
 ```bash
 # generate and save the imatrix using legacy format
-./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legacy-format.dat -ngl 99
 ```
 ```bash
-# covert legacy (binary) imatrix format to new (GGUF) format
+# convert legacy (binary) imatrix format to new (GGUF) format
 ./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
 ```
 ```bash
 # convert new (GGUF) imatrix format to legacy (binary) format
 ./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
 ```
 ```bash
 # combine existing imatrices
 ./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@ -26,7 +26,7 @@
 static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
            "       [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
            "       [--show-statistics] [...]\n" , argv[0]);
@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
    // TODO: 4d? (is that even used in practice?)
    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
        GGML_ASSERT(false);
    }
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
        // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
        if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
            LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
            GGML_ASSERT(false);
        }
        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        }
    } else {
        auto & e = m_stats[wname];
-        const int64_t n_mat = src1->ne[2] * src1->ne[3];
+        const int64_t n_mat = src0->ne[2] * src0->ne[3];
        // use a single count per dense tensor
        // (necessary when merging older GGUF-imatrix files with 3d tensors)
        if (e.counts.size() > 1) {
            bool all_equal = true;
            for (size_t i = 1; i < e.counts.size(); ++i) {
                if (e.counts[0] != e.counts[i]) {
                    all_equal = false;
                    break;
                }
            }
            if (all_equal) {
                e.counts.resize(1);
            }
        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_mat, 0);
-            e.counts.resize(n_mat, 0);
+            e.counts.resize(1, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_mat) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
-                const int64_t mat_id = i3 * src1->ne[2] + i2;
+                // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
                const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
                const int64_t mat_start = mat_id * src1->ne[0];
                for (int64_t row = 0; row < src1->ne[1]; ++row) {
-                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
+                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                    e.counts[mat_id]++;
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[j])) {
@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        }
                    }
                }
-                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
+            }
-                if (n_chunk > m_last_chunk) {
+        }
-                    const int32_t chunk_step = n_chunk - m_last_chunk;
+        // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
-                    m_last_chunk = n_chunk;
+        for (size_t i = 0; i < e.counts.size(); ++i) {
-                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
+            e.counts[i] += ggml_nrows(src1) / n_mat;
-                        save_imatrix();
+            const int32_t n_chunk = e.counts[i] / chunk_size;
-                    }
+            if (n_chunk > m_last_chunk) {
-                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
-                        save_imatrix(m_last_chunk);
+                m_last_chunk = n_chunk;
-                    }
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                    save_imatrix(m_last_chunk);
                }
            }
        }
@ -492,13 +506,13 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
 void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;
    bool use_legacy_format = m_params.imat_dat;
-    // TODO: use the new format in more cases
+    if (use_legacy_format) {
    if (!string_ends_with(fname, ".gguf")) {
        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
        this->save_imatrix_legacy(n_chunk);
        return;
    }
    // else, default to GGUF imatrix
    if (n_chunk > 0) {
        fname += ".at_";
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@ -1738,7 +1738,7 @@ struct sql_printer : public printer {
    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
-        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
+        fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
                    i < fields.size() - 1 ? "," : "");
@ -1749,7 +1749,7 @@ struct sql_printer : public printer {
    }
    void print_test(const test & t) override {
-        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
+        fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
        fprintf(fout, "VALUES (");
        std::vector<std::string> values = t.get_values();
        for (size_t i = 0; i < values.size(); i++) {
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@ -611,7 +611,7 @@ int main(int argc, char ** argv) {
            return 1;
        }
        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
            return 1;
        }
        if (ftype_str == "COPY") {
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -4249,9 +4249,6 @@ int main(int argc, char ** argv) {
            // process prompt
            std::vector<server_tokens> inputs;
            if (oaicompat && !prompt.is_string()) {
                throw std::runtime_error("prompt must be a string");
            }
            if (oaicompat && has_mtmd) {
                // multimodal
--- a/vendor/minja/chat-template.hpp
+++ b/vendor/minja/chat-template.hpp
@ -162,10 +162,15 @@ class chat_template {
        }), false);
        caps_.supports_tools = contains(out, "some_tool");
        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
        json j_null;
        auto make_tool_calls_msg = [&](const json & tool_calls) {
            return json {
                {"role", "assistant"},
-                {"content", nullptr},
+                {"content", caps_.requires_non_null_content? "" : j_null},
                {"tool_calls", tool_calls},
            };
        };
@ -195,9 +200,6 @@ class chat_template {
        caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
        caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
        if (caps_.supports_tool_calls) {
            auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
@ -234,7 +236,7 @@ class chat_template {
                };
                const json tool_call_msg {
                    {"role", "assistant"},
-                    {"content", nullptr},
+                    {"content", caps_.requires_non_null_content ? "" : j_null},
                    {"tool_calls", json::array({
                        {
                            // TODO: detect if requires numerical id or fixed length == 6 like Nemo
--- a/vendor/minja/minja.hpp
+++ b/vendor/minja/minja.hpp
@ -1355,8 +1355,13 @@ public:
              case Op::Gt:        return l > r;
              case Op::Le:        return l <= r;
              case Op::Ge:        return l >= r;
-              case Op::In:        return (r.is_array() || r.is_object()) && r.contains(l);
+              case Op::In:        return (((r.is_array() || r.is_object()) && r.contains(l)) ||
-              case Op::NotIn:     return !(r.is_array() && r.contains(l));
+                                          (l.is_string() && r.is_string() &&
                                            r.to_str().find(l.to_str()) != std::string::npos));
              case Op::NotIn:
                                  return !(((r.is_array() || r.is_object()) && r.contains(l)) ||
                                            (l.is_string() && r.is_string() &&
                                              r.to_str().find(l.to_str()) != std::string::npos));
              default:            break;
          }
          throw std::runtime_error("Unknown binary operator");
@ -1552,6 +1557,19 @@ public:
              else res[i] = std::tolower(res[i]);
            }
            return res;
          } else if (method->get_name() == "replace") {
            vargs.expectArgs("replace method", {2, 3}, {0, 0});
            auto before = vargs.args[0].get<std::string>();
            auto after = vargs.args[1].get<std::string>();
            auto count = vargs.args.size() == 3 ? vargs.args[2].get<int64_t>()
                                                : str.length();
            size_t start_pos = 0;
            while ((start_pos = str.find(before, start_pos)) != std::string::npos &&
                  count-- > 0) {
              str.replace(start_pos, before.length(), after);
              start_pos += after.length();
            }
            return str;
          }
        }
        throw std::runtime_error("Unknown method: " + method->get_name());
@ -2128,7 +2146,7 @@ private:
            }
          }
-          if ((has_first_colon || has_second_colon) && (start || end || step)) {
+          if ((has_first_colon || has_second_colon)) {
            index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
          } else {
            index = std::move(start);