diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 22f703e6ad..16c5acf346 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1212,6 +1212,9 @@ class TextModel(ModelBase): if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": # ref: https://huggingface.co/JetBrains/Mellum-4b-base res = "mellum" + if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": + # ref: https://huggingface.co/answerdotai/ModernBERT-base + res = "modern-bert" if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer res = "afmoe" @@ -9999,6 +10002,36 @@ class SmallThinkerModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification") +class ModernBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.MODERN_BERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + self.gguf_writer.add_add_sep_token(True) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) + if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # these layers act as MLM head, so we don't need them + if name.startswith("decoder."): + return [] + + if name.startswith("model."): + name = name[6:] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("ApertusForCausalLM") class ApertusModel(LlamaModel): model_arch = gguf.MODEL_ARCH.APERTUS diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 5e8456a7ea..4378378309 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -139,6 +139,7 @@ models = [ {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, + {"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", }, {"name": "afmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", }, {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", }, {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", }, diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md index 85f136ef9e..00ec3a7e71 100644 --- a/docs/backend/hexagon/README.md +++ b/docs/backend/hexagon/README.md @@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB. 
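An aside on the `chkhsh` entries added in the `convert_hf_to_gguf.py` hunk above: the hash fingerprints how a model's pre-tokenizer encodes a fixed probe text, and the new `modern-bert` entry maps ModernBERT's fingerprint to its pre-tokenizer settings. A minimal sketch of the mechanism, assuming the short probe below stands in for the much longer mixed-script text the real `get_vocab_base_pre()` uses:

```
# Sketch only: hash the token IDs produced for a fixed probe string and
# match the digest against the registered entries.
from hashlib import sha256
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
chktok = tok.encode("Hello World!")  # placeholder; the real probe text is much longer
chkhsh = sha256(str(chktok).encode()).hexdigest()
# With the real probe this digest would be the registered
# "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152" -> res = "modern-bert"
print(chkhsh)
```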
Simple question for Llama-3.2-1B ``` -~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" +~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?" ... ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 ggml-hex: Hexagon Arch version v79 @@ -136,7 +136,7 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 504 = Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices ``` -~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv +~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt ... ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 ggml-hex: Hexagon Arch version v81 @@ -234,6 +234,6 @@ build: 6a8cf8914 (6733) Examples: - `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out - `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest - `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default) + `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out + `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest + `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default) diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md index 200a7aabc0..fc4d160e93 100644 --- a/docs/backend/hexagon/developer.md +++ b/docs/backend/hexagon/developer.md @@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. ``` -M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32 +M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32 ... 
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib diff --git a/examples/model-conversion/Makefile b/examples/model-conversion/Makefile index 25b0514b29..f8dc525a77 100644 --- a/examples/model-conversion/Makefile +++ b/examples/model-conversion/Makefile @@ -25,6 +25,8 @@ define quantize_model @echo "Export the quantized model path to $(2) variable in your environment" endef +DEVICE ?= auto + ### ### Casual Model targets/recipes ### @@ -53,7 +55,7 @@ causal-convert-mm-model: causal-run-original-model: $(call validate_model_path,causal-run-original-model) - @MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py + @MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)" causal-run-converted-model: @CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh diff --git a/examples/model-conversion/scripts/causal/run-org-model.py b/examples/model-conversion/scripts/causal/run-org-model.py index 14bb12fe68..b12173a1fb 100755 --- a/examples/model-conversion/scripts/causal/run-org-model.py +++ b/examples/model-conversion/scripts/causal/run-org-model.py @@ -4,149 +4,179 @@ import argparse import os import sys import importlib +import torch +import numpy as np + from pathlib import Path +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig # Add parent directory to path for imports sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) - -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig -import torch -import numpy as np from utils.common import debug_hook -parser = argparse.ArgumentParser(description="Process model with specified path") -parser.add_argument("--model-path", "-m", help="Path to the model") -parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False) -parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output") -args = parser.parse_args() +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process model with specified path") + parser.add_argument("--model-path", "-m", help="Path to the model") + parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False) + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output") + parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto") + return parser.parse_args() -model_path = os.environ.get("MODEL_PATH", args.model_path) -if model_path is None: - parser.error( - "Model path must be specified either via --model-path argument or MODEL_PATH environment variable" - ) +def load_model_and_tokenizer(model_path, device="auto"): + print("Loading model and tokenizer using AutoTokenizer:", model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + multimodal = False + full_config = config -### If you want to dump RoPE activations, uncomment the following lines: -### === START ROPE DEBUG === -# from utils.common import setup_rope_debug -# setup_rope_debug("transformers.models.apertus.modeling_apertus") -### == END ROPE DEBUG === - - -print("Loading model and tokenizer using AutoTokenizer:", model_path) -tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) -config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) -multimodal = 
False -full_config = config - -print("Model type: ", config.model_type) -if "vocab_size" not in config and "text_config" in config: - config = config.text_config - multimodal = True -print("Vocab size: ", config.vocab_size) -print("Hidden size: ", config.hidden_size) -print("Number of layers: ", config.num_hidden_layers) -print("BOS token id: ", config.bos_token_id) -print("EOS token id: ", config.eos_token_id) - -unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME") -if unreleased_model_name: - model_name_lower = unreleased_model_name.lower() - unreleased_module_path = ( - f"transformers.models.{model_name_lower}.modular_{model_name_lower}" - ) - class_name = f"{unreleased_model_name}ForCausalLM" - print(f"Importing unreleased model module: {unreleased_module_path}") - - try: - model_class = getattr( - importlib.import_module(unreleased_module_path), class_name - ) - model = model_class.from_pretrained( - model_path - ) # Note: from_pretrained, not fromPretrained - except (ImportError, AttributeError) as e: - print(f"Failed to import or load model: {e}") - exit(1) -else: - if multimodal: - model = AutoModelForImageTextToText.from_pretrained( - model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config - ) + # Determine device_map based on device argument + if device == "cpu": + device_map = {"": "cpu"} + print("Forcing CPU usage") + elif device == "auto": + device_map = "auto" else: - model = AutoModelForCausalLM.from_pretrained( - model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config + device_map = {"": device} + + print("Model type: ", config.model_type) + if "vocab_size" not in config and "text_config" in config: + config = config.text_config + multimodal = True + + print("Vocab size: ", config.vocab_size) + print("Hidden size: ", config.hidden_size) + print("Number of layers: ", config.num_hidden_layers) + print("BOS token id: ", config.bos_token_id) + print("EOS token id: ", config.eos_token_id) + + unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME") + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = ( + f"transformers.models.{model_name_lower}.modular_{model_name_lower}" ) + class_name = f"{unreleased_model_name}ForCausalLM" + print(f"Importing unreleased model module: {unreleased_module_path}") -if args.verbose: - for name, module in model.named_modules(): - if len(list(module.children())) == 0: # only leaf modules - module.register_forward_hook(debug_hook(name)) + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) + else: + if multimodal: + model = AutoModelForImageTextToText.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=full_config + ) + else: + model = AutoModelForCausalLM.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) -model_name = os.path.basename(model_path) -# Printing the Model class to allow for easier debugging. 
This can be useful -# when working with models that have not been publicly released yet and this -# migth require that the concrete class is imported and used directly instead -# of using AutoModelForCausalLM. -print(f"Model class: {model.__class__.__name__}") + print(f"Model class: {model.__class__.__name__}") -device = next(model.parameters()).device -if args.prompt_file: - with open(args.prompt_file, encoding='utf-8') as f: - prompt = f.read() -elif os.getenv("MODEL_TESTING_PROMPT"): - prompt = os.getenv("MODEL_TESTING_PROMPT") -else: - prompt = "Hello, my name is" -input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + return model, tokenizer, config -print(f"Input tokens: {input_ids}") -print(f"Input text: {repr(prompt)}") -print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") +def enable_torch_debugging(model): + for name, module in model.named_modules(): + if len(list(module.children())) == 0: # only leaf modules + module.register_forward_hook(debug_hook(name)) -batch_size = 512 +def get_prompt(args): + if args.prompt_file: + with open(args.prompt_file, encoding='utf-8') as f: + return f.read() + elif os.getenv("MODEL_TESTING_PROMPT"): + return os.getenv("MODEL_TESTING_PROMPT") + else: + return "Hello, my name is" -with torch.no_grad(): - past = None - outputs = None - for i in range(0, input_ids.size(1), batch_size): - print(f"Processing chunk with tokens {i} to {i + batch_size}") - chunk = input_ids[:, i:i + batch_size] - outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True) - past = outputs.past_key_values +def main(): + args = parse_arguments() + model_path = os.environ.get("MODEL_PATH", args.model_path) + if model_path is None: + print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + sys.exit(1) - logits = outputs.logits # type: ignore - # Extract logits for the last token (next token prediction) - last_logits = logits[0, -1, :].float().cpu().numpy() + model, tokenizer, config = load_model_and_tokenizer(model_path, args.device) - print(f"Logits shape: {logits.shape}") - print(f"Last token logits shape: {last_logits.shape}") - print(f"Vocab size: {len(last_logits)}") + if args.verbose: + enable_torch_debugging(model) - data_dir = Path("data") - data_dir.mkdir(exist_ok=True) - bin_filename = data_dir / f"pytorch-{model_name}.bin" - txt_filename = data_dir / f"pytorch-{model_name}.txt" + model_name = os.path.basename(model_path) - # Save to file for comparison - last_logits.astype(np.float32).tofile(bin_filename) + # Iterate over the model parameters (the tensors) and get the first one + # and use it to get the device the model is on. 
+    device = next(model.parameters()).device
+    prompt = get_prompt(args)
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
-    # Also save as text file for easy inspection
-    with open(txt_filename, "w") as f:
-        for i, logit in enumerate(last_logits):
-            f.write(f"{i}: {logit:.6f}\n")
+    print(f"Input tokens: {input_ids}")
+    print(f"Input text: {repr(prompt)}")
+    print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
-    # Print some sample logits for quick verification
-    print(f"First 10 logits: {last_logits[:10]}")
-    print(f"Last 10 logits: {last_logits[-10:]}")
+    batch_size = 512
-    # Show top 5 predicted tokens
-    top_indices = np.argsort(last_logits)[-5:][::-1]
-    print("Top 5 predictions:")
-    for idx in top_indices:
-        token = tokenizer.decode([idx])
-        print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
+    with torch.no_grad():
+        past = None
+        outputs = None
+        for i in range(0, input_ids.size(1), batch_size):
+            print(f"Processing chunk with tokens {i} to {i + batch_size}")
+            chunk = input_ids[:, i:i + batch_size]
+            outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
+            past = outputs.past_key_values
-    print(f"Saved bin logits to: {bin_filename}")
-    print(f"Saved txt logist to: {txt_filename}")
+    logits = outputs.logits  # type: ignore
+
+    # Extract logits for the last token (next token prediction)
+    last_logits = logits[0, -1, :].float().cpu().numpy()
+
+    print(f"Logits shape: {logits.shape}")
+    print(f"Last token logits shape: {last_logits.shape}")
+    print(f"Vocab size: {len(last_logits)}")
+
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    bin_filename = data_dir / f"pytorch-{model_name}.bin"
+    txt_filename = data_dir / f"pytorch-{model_name}.txt"
+
+    # Save to file for comparison
+    last_logits.astype(np.float32).tofile(bin_filename)
+
+    # Also save as text file for easy inspection
+    with open(txt_filename, "w") as f:
+        for i, logit in enumerate(last_logits):
+            f.write(f"{i}: {logit:.6f}\n")
+
+    # Print some sample logits for quick verification
+    print(f"First 10 logits: {last_logits[:10]}")
+    print(f"Last 10 logits: {last_logits[-10:]}")
+
+    # Show top 5 predicted tokens
+    top_indices = np.argsort(last_logits)[-5:][::-1]
+    print("Top 5 predictions:")
+    for idx in top_indices:
+        token = tokenizer.decode([idx])
+        print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
+
+    print(f"Saved bin logits to: {bin_filename}")
+    print(f"Saved txt logits to: {txt_filename}")
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py
index 640e200a97..39f054d0e0 100755
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -45,7 +45,7 @@ if use_sentence_transformers:
 else:
     tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-    config = AutoConfig.from_pretrained(model_path)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 
     # This can be used to override the sliding window size for manual testing.
This # can be useful to verify the sliding window attention mask in the original model @@ -64,12 +64,12 @@ else: try: model_class = getattr(importlib.import_module(unreleased_module_path), class_name) - model = model_class.from_pretrained(model_path, config=config) + model = model_class.from_pretrained(model_path, config=config, trust_remote_code=True) except (ImportError, AttributeError) as e: print(f"Failed to import or load model: {e}") exit(1) else: - model = AutoModel.from_pretrained(model_path, config=config) + model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True) print(f"Model class: {type(model)}") print(f"Model file: {type(model).__module__}") @@ -123,7 +123,7 @@ with torch.no_grad(): outputs = model(**encoded) hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size] - all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size] + all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size] print(f"Hidden states shape: {hidden_states.shape}") print(f"All embeddings shape: {all_embeddings.shape}") diff --git a/examples/model-conversion/scripts/utils/semantic_check.py b/examples/model-conversion/scripts/utils/semantic_check.py index 2ac8b6b7b4..e64c000497 100644 --- a/examples/model-conversion/scripts/utils/semantic_check.py +++ b/examples/model-conversion/scripts/utils/semantic_check.py @@ -166,7 +166,7 @@ def main(): # Load the python model to get configuration information and also to load the tokenizer. print("Loading model and tokenizer using AutoTokenizer:", args.model_path) tokenizer = AutoTokenizer.from_pretrained(args.model_path) - config = AutoConfig.from_pretrained(args.model_path) + config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True) if unreleased_model_name: model_name_lower = unreleased_model_name.lower() @@ -186,9 +186,9 @@ def main(): exit(1) else: if args.causal: - model = AutoModelForCausalLM.from_pretrained(args.model_path) + model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True) else: - model = AutoModel.from_pretrained(args.model_path) + model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True) encoded = tokenizer(prompt, return_tensors="pt") tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index a018e45197..cf23619ee0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -22,9 +22,9 @@ if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} fi diff --git a/examples/sycl/run-llama3.sh b/examples/sycl/run-llama3.sh index 4770255703..feee5165e9 100755 --- a/examples/sycl/run-llama3.sh +++ b/examples/sycl/run-llama3.sh @@ -24,8 +24,8 @@ export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 if [ $# -gt 0 ]; then 
GGML_SYCL_DEVICE=$1 echo "Using $GGML_SYCL_DEVICE as the main GPU" - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} fi diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index b654f88f62..32ff673ae2 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" :: support malloc device memory more than 4GB. set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 -.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0 +.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 diff --git a/examples/sycl/win-run-llama3.bat b/examples/sycl/win-run-llama3.bat index 608b834f60..ea4ae69d6c 100644 --- a/examples/sycl/win-run-llama3.bat +++ b/examples/sycl/win-run-llama3.bat @@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" :: support malloc device memory more than 4GB. set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 -.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99 +.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99 diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 853a5bda1e..13b96d61f8 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -7,9 +7,10 @@ #include #include +#include #include -#include #include +#include #ifdef _WIN32 # include @@ -36,6 +37,7 @@ #include "ggml-hexagon.h" #include "ggml-impl.h" #include "ggml-quants.h" +#include "op-desc.h" #include "htp-msg.h" #include "htp_iface.h" @@ -55,9 +57,6 @@ static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) -#define HEX_PROFILE(...) 
\ - if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) - static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; } @@ -85,128 +84,30 @@ static const char * status_to_str(uint32_t status) { // ** debug helpers -static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) { - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); - } else { - return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - } +static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) { + if (!opt_verbose) return; + + op_desc desc(op); + GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(), + ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags); } -static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) { - char * p = str; +static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { + if (!opt_verbose) return; - // append src0 and src1 (if any) - if (t->src[0]) { - p += hex_format_tensor_dims(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += hex_format_tensor_dims(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - hex_format_tensor_dims(self, t); - - p += sprintf(p, "%s", self); + op_desc desc(op); + GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(), + ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no"); } -static inline int hex_format_tensor_strides(char * str, const struct ggml_tensor * t) { - const char * c = ggml_is_contiguous(t) ? 
"" : "!"; +static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, + uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) { + if (!opt_profile) return; - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); - } else { - return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], - (size_t) t->nb[3], c); - } -} - -static inline void hex_format_op_strides(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += hex_format_tensor_strides(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += hex_format_tensor_strides(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - hex_format_tensor_strides(self, t); - - p += sprintf(p, "%s", self); -} - -static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", ggml_type_name(t->type)); -} - -static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) { - if (t->buffer) { - return ggml_backend_buffer_name(t->buffer); - } - return "NONE"; -} - -static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0])); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i])); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", hex_tensor_buff_name(t)); -} - -static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", t->src[0]->name); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", t->src[i]->name); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", t->name); + op_desc desc(op); + GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(), + ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, + op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec); } // ** backend sessions @@ -221,8 +122,8 @@ struct ggml_hexagon_session { void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); void flush(); - ggml_backend_buffer_type buffer_type; - ggml_backend_buffer_type repack_buffer_type; + ggml_backend_buffer_type buffer_type = {}; + ggml_backend_buffer_type repack_buffer_type = {}; std::string name; remote_handle64 handle; @@ -241,23 +142,6 @@ struct ggml_hexagon_session { uint32_t prof_pkts; }; -static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char 
buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), - names, dims, types, strides, buffs, req_flags); -} - void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -1598,7 +1482,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( try { ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1610,7 +1494,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe try { ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1697,8 +1581,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1751,7 +1635,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1838,11 +1722,8 @@ void ggml_hexagon_session::release() noexcept(true) { } ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) { - buffer_type.context = nullptr; - repack_buffer_type.context = nullptr; - - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1852,7 +1733,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { release(); throw; } @@ -1861,8 +1742,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** 
backend interface @@ -1930,15 +1811,6 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t return true; } -template -static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) { - return ([&]() -> bool { - return !tensors || !tensors->buffer || - (ggml_backend_buffer_is_hexagon(tensors->buffer) && - ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess); - }() && ...); -} - static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -1976,17 +1848,16 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s break; case GGML_TYPE_F16: + if (src0->nb[1] < src0->nb[0]) { + GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n"); + return false; + } break; default: return false; } - // src0 & src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; - } - return true; } @@ -2029,12 +1900,6 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session return false; } - // src0 (weights) must be repacked and mapped to the same session - // src1 & sr2 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { - return false; - } - return true; } @@ -2064,18 +1929,12 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se return false; } - // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; - } - return true; } static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * src2 = op->src[2]; const struct ggml_tensor * dst = op; if (!hex_supported_src0_type(src0->type)) { @@ -2096,11 +1955,6 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se return false; } - // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { - return false; - } - return true; } @@ -2123,11 +1977,6 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses return false; } - // src0 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, dst)) { - return false; - } - return true; } @@ -2160,17 +2009,6 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } } - // src0, src1 & dst must be mapped to the same session - if(src1){ - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; - } - }else{ - if (!hex_supported_buffer(sess, src0, dst)) { - return false; - } - } - return true; } @@ -2219,11 +2057,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s } } - // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; - } - return true; } @@ -2274,16 +2107,28 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess } } - // src0, src1, src2 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { - return false; - } - return true; } +enum dspqbuf_type { + DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0, + 
DSPQBUF_TYPE_CPU_WRITE_DSP_READ, + DSPQBUF_TYPE_CONSTANT, +}; + +static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) { + if (opt_verbose < 2) return; + + auto buf = static_cast(t->buffer->context); + auto sess = buf->sess; + + GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), + t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, + (unsigned int) d->size); +} + // Init hexagon tensor from GGML tensor and Hexagon buffer -static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { +static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) { h->data = 0; // updated by the receiver h->type = t->type; h->ne[0] = t->ne[0]; @@ -2296,53 +2141,52 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { h->nb[3] = t->nb[3]; } -static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) { +static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) { if (!t) { return 0; } - memset(buf, 0, sizeof(*buf)); - auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU - buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP + auto buf = static_cast(t->buffer->context); + + memset(d, 0, sizeof(*d)); + d->fd = buf->fd; + d->ptr = t->data; + d->offset = (uint8_t *) t->data - buf->base; + d->size = ggml_nbytes(t); + + switch (type) { + case DSPQBUF_TYPE_DSP_WRITE_CPU_READ: + // Flush CPU + d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER; + break; + case DSPQBUF_TYPE_CPU_WRITE_DSP_READ: + // Flush CPU, Invalidate DSP + d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + break; + default: + // Constant buffer, no cache maintenance + d->flags = 0; + break; + } + + htp_req_tensor_init(h, t); + + dspqbuf_dump(d, t, type); + return 1; } -static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) { - return static_cast(t->buffer->context)->sess; -} +typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op); -static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { - auto buf = static_cast(t->buffer->context); - auto sess = buf->sess; +template +static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) { + uint64_t t = ggml_time_us(); - HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), - t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, - (unsigned int) d->size); -} - -static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * dst = op; - - uint64_t t1, t2; - t1 = ggml_time_us(); - - // Construct HTP message + // Construct HTP request htp_general_req req; - req.op = HTP_OP_MUL_MAT; + memset(&req, 0, sizeof(req)); + req.flags = flags; - - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); - - // Use opmask to 
override flags if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; } @@ -2350,342 +2194,111 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - dspqueue_buffer bufs[3]; - - // First buffer Weights. - // The content is static, there is no need to do any cache management - dspqueue_buffers_init(bufs, src0, false, false); - - // Second buffer Input Activations. This is a buffer that the CPU - // writes and the DSP reads, so we'll need to flush CPU caches and - // invalidate DSP ones. On platforms with I/O coherency support the - // framework will automatically skip cache operations where possible. - dspqueue_buffers_init(&bufs[1], src1, true, true); - - // Third buffer Output Activations. We'll handle DSP - // cache maintenance in the response message but need to flush - // CPU caches to ensure any previously written dirty lines are - // written out before writes from the DSP start. - dspqueue_buffers_init(&bufs[2], dst, true, false); - - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); - } - } + ggml_hexagon_dump_op_exec(sess->name, op, req.flags); if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 3, opt_opsync); + dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; + size_t n_bufs = _init_req_func(&req, bufs, op); + sess->enqueue(req, bufs, n_bufs, opt_opsync); } - t2 = ggml_time_us(); + t = ggml_time_us() - t; - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t); } -static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * src2 = op->src[2]; - const struct ggml_tensor * dst = op; - - uint64_t t1, t2; - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.op = HTP_OP_MUL_MAT_ID; - req.flags = flags; - - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.src2, src2); - init_htp_tensor(&req.dst, dst); - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[4]; - // First buffer Weights. - // The content is static, there is no need to do any cache management - dspqueue_buffers_init(bufs, src0, false, false); - - // Second buffer Input Activations. This is a buffer that the CPU - // writes and the DSP reads, so we'll need to flush CPU caches and - // invalidate DSP ones. 
On platforms with I/O coherency support the - // framework will automatically skip cache operations where possible. - dspqueue_buffers_init(&bufs[1], src1, true, true); - - // Third buffer expert IDs. This is a buffer that the CPU - // writes and the DSP reads, so we'll need to flush CPU caches and - // invalidate DSP ones. On platforms with I/O coherency support the - // framework will automatically skip cache operations where possible. - dspqueue_buffers_init(&bufs[2], src2, true, true); - - // Forth buffer Output Activations. We'll handle DSP - // cache maintenance in the response message but need to flush - // CPU caches to ensure any previously written dirty lines are - // written out before writes from the DSP start. - dspqueue_buffers_init(&bufs[3], dst, true, false); - - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(src2, &bufs[2]); - hex_dump_dspbuf(dst, &bufs[3]); - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 4, opt_opsync); - } - - t2 = ggml_time_us(); - - HEX_PROFILE( - "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u " - "op-cycles %u op-pkts %u (%f) call-usec %llu\n", - sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], - (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], - (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2], - (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); -} - -static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * node = op; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const struct ggml_tensor * dst = node; - - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.flags = flags; - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - switch (node->op) { +template +static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { + switch (t->op) { + case GGML_OP_MUL_MAT: + req->op = HTP_OP_MUL_MAT; + break; case GGML_OP_MUL: - req.op = HTP_OP_MUL; + req->op = HTP_OP_MUL; break; case GGML_OP_ADD: - req.op = HTP_OP_ADD; + req->op = HTP_OP_ADD; break; case GGML_OP_SUB: - req.op = HTP_OP_SUB; + req->op = HTP_OP_SUB; break; default: - GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); + GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op); + break; } - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); + // src0: Weights (mulmat) or First Operand (binary op). + // If constant (e.g. weights), no cache management is needed. + // src1: Input Activations (mulmat) or Second Operand (binary op). 
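The comment block above summarizes the buffer roles; as a cross-check, the sketch below restates the cache-maintenance rule that the `dspqbuf_type` enum encodes (exposition-only Python; the flag names are stand-ins for the `DSPQUEUE_BUFFER_FLAG_*` constants, not part of this patch): the writer's caches get flushed, the reader's caches get invalidated, and constants need neither.

```
# Toy restatement of the dspqbuf_type -> cache-flag mapping (illustrative only).
FLUSH_SENDER = 0x1          # stand-in for DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER
INVALIDATE_RECIPIENT = 0x2  # stand-in for DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT

def flags_for(kind: str) -> int:
    return {
        "cpu_write_dsp_read": FLUSH_SENDER | INVALIDATE_RECIPIENT,  # inputs/activations
        "dsp_write_cpu_read": FLUSH_SENDER,  # outputs: flush dirty CPU lines first
        "constant": 0,                       # static weights: no maintenance needed
    }[kind]

assert flags_for("constant") == 0
```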
- dspqueue_buffer bufs[3]; - // First buffer = First Operand of Binary op - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - dspqueue_buffers_init(bufs, src0, true, true); + size_t n_bufs = 0; + n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - // Second buffer = Second Operand of Binary op - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - dspqueue_buffers_init(&bufs[1], src1, true, true); - - // Third buffer = Output Activations. We'll handle DSP - // cache maintenance in the response message but need to flush - // CPU caches to ensure any previously written dirty lines are - // written out before writes from the DSP start. - dspqueue_buffers_init(&bufs[2], dst, true, false); - - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 3, opt_opsync); - } - - t2 = ggml_time_us(); - - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + return n_bufs; } -static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * node = op; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const struct ggml_tensor * src2 = node->src[2]; - const struct ggml_tensor * dst = node; - - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.flags = flags; - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - switch (node->op) { +template +static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { + switch (t->op) { + case GGML_OP_MUL_MAT_ID: + req->op = HTP_OP_MUL_MAT_ID; + break; case GGML_OP_ADD_ID: - req.op = HTP_OP_ADD_ID; + req->op = HTP_OP_ADD_ID; break; default: - GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); + GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op); } - init_htp_tensor(&req.src0, src0); - 
init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.src2, src2); - init_htp_tensor(&req.dst, dst); + // src0: Weights (mulmat) or Input Activations (other op). + // If constant, no cache management is needed. + // src1: Input Activations (mulmat) or Second Operand (binary op). + // src2: Expert IDs (mulmat) or Activated Experts (other op). - dspqueue_buffer bufs[4]; - // First buffer = input activations - dspqueue_buffers_init(bufs, src0, true, true); - // Second buffer = experts bias - dspqueue_buffers_init(&bufs[1], src1, true, true); - // Third buffer = activated experts - dspqueue_buffers_init(&bufs[2], src2, true, true); - // Forth buffer = output activations - dspqueue_buffers_init(&bufs[3], dst, true, true); + size_t n_bufs = 0; + n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(src2, &bufs[2]); - hex_dump_dspbuf(dst, &bufs[3]); - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 4, opt_opsync); - } - - t2 = ggml_time_us(); - - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + return n_bufs; } -static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * dst = op; - - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - - memset(&req, 0, sizeof(htp_general_req)); - memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); - req.flags = flags; +static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { + memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); bool supported = false; - switch (op->op) { + switch (t->op) { case GGML_OP_RMS_NORM: - req.op = HTP_OP_RMS_NORM; + req->op = HTP_OP_RMS_NORM; supported = true; break; case GGML_OP_UNARY: - if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { - req.op = HTP_OP_UNARY_SILU; + if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) { + req->op = HTP_OP_UNARY_SILU; supported = true; - } - else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { - req.op = HTP_OP_UNARY_GELU; + } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) { + req->op = HTP_OP_UNARY_GELU; supported = true; } break; case GGML_OP_GLU: - if (ggml_get_glu_op(dst) == 
GGML_GLU_OP_SWIGLU) { - req.op = HTP_OP_GLU_SWIGLU; + if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) { + req->op = HTP_OP_GLU_SWIGLU; supported = true; - } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) { - req.op = HTP_OP_GLU_SWIGLU_OAI; + } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) { + req->op = HTP_OP_GLU_SWIGLU_OAI; supported = true; } break; case GGML_OP_SOFT_MAX: - req.op = HTP_OP_SOFTMAX; + req->op = HTP_OP_SOFTMAX; supported = true; break; @@ -2694,194 +2307,28 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { } if (!supported) { - GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); + GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op); } - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); - if (src1) { - init_htp_tensor(&req.src1, src1); - } + size_t n_bufs = 0; + n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[3]; - - // First buffer = Only Operand of Unary op - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true); - - // Second buffer(nullable) = Second Operand of Binary op - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true); - - // Second or third buffer = Output Activations. We'll handle DSP - // Second buffer = Output Activations. We'll handle DSP - // cache maintenance in the response message but need to flush - // CPU caches to ensure any previously written dirty lines are - // written out before writes from the DSP start. 
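For orientation while reading the removals below: each legacy `ggml_hexagon_<op>` function repeated the same time-stamp, flag-mask, enqueue, profile-dump sequence, which is exactly what the new `ggml_hexagon_dispatch_op<>` template factors out. An exposition-only Python analogue of that shape (names illustrative, not from this patch):

```
import time

# One generic driver parametrized by an op-specific request initializer
# replaces the near-identical per-op functions (mul_mat, binary, unary, rope).
def dispatch_op(sess, op, init_req, queue=True, sync=False):
    t0 = time.time()
    req, bufs = init_req(op)  # op-specific: opcode, tensor descriptors, cache flags
    if queue:                 # shared: enqueue the request, optionally synchronously
        sess.enqueue(req, bufs, sync)
    return time.time() - t0   # analogue of the "call-usec" figure in the profile dump
```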
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false); - - // Primary DSP session from the src0 tensor - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - if (src1) { - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); - } else { - hex_dump_dspbuf(dst, &bufs[1]); - } - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, n_bufs, opt_opsync); - } - - t2 = ggml_time_us(); - - if (src1) { - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u " - "(%f) call-usec %llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); - } else { - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec " - "%llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); - } + return n_bufs; } -static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * src2 = op->src[2]; - const struct ggml_tensor * dst = op; +static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { + memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); + req->op = HTP_OP_ROPE; - uint64_t t1 = 0; - uint64_t t2 = 0; + size_t n_bufs = 0; + n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); + n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - - memset(&req, 0, sizeof(htp_general_req)); - memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); - req.flags = flags; - req.op = HTP_OP_ROPE; - - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - if (src2) { - init_htp_tensor(&req.src2, src2); - } - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[4]; - - // First buffer - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. 
On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true); - - // Second buffer - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true); - - // Third buffer(nullable) - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true); - - // Final buffer = Output Activations. We'll handle DSP - // Second buffer = Output Activations. We'll handle DSP - // cache maintenance in the response message but need to flush - // CPU caches to ensure any previously written dirty lines are - // written out before writes from the DSP start. - n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false); - - // Primary DSP session from the src0 tensor - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - if (src1) { - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); - } else { - hex_dump_dspbuf(dst, &bufs[1]); - } - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, n_bufs, opt_opsync); - } - - t2 = ggml_time_us(); - - if (src2) { - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles " - "%u op-pkts %u (%f) call-usec %llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], - (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); - } else { - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u " - "(%f) call-usec %llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); - } + return n_bufs; } static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { @@ -2896,7 +2343,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { } static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { - return (op0 && op0->src[1] == op1->src[1]); + return (op0 && op0->src[1] == op1->src[1] && 
ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type)); } static inline bool is_compute_op(ggml_tensor *node) @@ -2946,43 +2393,50 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg switch (node->op) {
 case GGML_OP_MUL_MAT:
- ggml_hexagon_mul_mat(node, flags);
+ if (ggml_is_quantized(node->src[0]->type)) {
+ ggml_hexagon_dispatch_op>(sess, node, flags);
+ } else {
+ ggml_hexagon_dispatch_op>(sess, node, flags);
+ }
 prev_quant_op = node;
 break;
 case GGML_OP_MUL_MAT_ID:
- ggml_hexagon_mul_mat_id(node, flags);
+ if (ggml_is_quantized(node->src[0]->type)) {
+ ggml_hexagon_dispatch_op>(sess, node, flags);
+ } else {
+ ggml_hexagon_dispatch_op>(sess, node, flags);
+ }
 prev_quant_op = node;
 break;
 case GGML_OP_MUL:
 case GGML_OP_ADD:
 case GGML_OP_SUB:
- ggml_hexagon_binary(node, flags);
+ ggml_hexagon_dispatch_op>(sess, node, flags);
 break;
 case GGML_OP_ADD_ID:
- ggml_hexagon_add_id(node, flags);
+ ggml_hexagon_dispatch_op>(sess, node, flags);
 break;
 case GGML_OP_RMS_NORM:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
 break;
 case GGML_OP_UNARY:
- if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
- ggml_hexagon_unary(node, flags);
- } else if (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU) {
- ggml_hexagon_unary(node, flags);
+ if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
+ (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
 }
 break;
 case GGML_OP_GLU:
 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
- (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
- ggml_hexagon_unary(node, flags);
+ (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
 }
 break;
 case GGML_OP_SOFT_MAX:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
 break;
 case GGML_OP_ROPE:
- ggml_hexagon_rope(node, flags);
+ ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
 break;
 default: @@ -3111,8 +2565,8 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes.
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/ gf->nodes[i], - /*.fused =*/ {}, + /*.node =*/gf->nodes[i], + /*.fused =*/{}, }; // fuse only ops that start with these operations @@ -3263,9 +2717,38 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_ return &sess->repack_buffer_type; } +static bool ggml_hexagon_supported_buffer(ggml_hexagon_session *sess, const struct ggml_tensor * t) { + if (t && t->buffer) { + if (ggml_backend_buffer_is_hexagon(t->buffer) == false) return false; // not our buffer + if (ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess) return false; // wrong session + } + return true; +} + +static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const struct ggml_tensor * t) { + // all srcs & dsts must be mapped to the same session + if (!ggml_hexagon_supported_buffer(sess, t)) { + return false; + } + + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (!ggml_hexagon_supported_buffer(sess, t->src[i])) { + return false; + } + } + + return true; +} + static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { auto sess = static_cast(dev->context); + // all srcs & dsts must be mapped to the same session + if (!ggml_hexagon_supported_buffers(sess, op)) { + ggml_hexagon_dump_op_supp(sess->name, op, false); + return false; + } + bool supp = false; switch (op->op) { case GGML_OP_NONE: @@ -3303,20 +2786,21 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons break; case GGML_OP_UNARY: - if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { - supp = ggml_hexagon_supported_activations(sess, op); + { + const auto unary_op = ggml_get_unary_op(op); + if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) { + supp = ggml_hexagon_supported_activations(sess, op); + } + break; } - else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ - supp = ggml_hexagon_supported_activations(sess, op); - } - break; - case GGML_OP_GLU: - if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) ) { - supp = ggml_hexagon_supported_activations(sess, op); + { + const auto glu_op = ggml_get_glu_op(op); + if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) { + supp = ggml_hexagon_supported_activations(sess, op); + } + break; } - break; - case GGML_OP_ROPE: supp = ggml_hexagon_supported_rope(sess, op); break; @@ -3325,26 +2809,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons break; } - if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), - ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp); - } - + ggml_hexagon_dump_op_supp(sess->name, op, supp); return supp; - - GGML_UNUSED(dev); } static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { @@ -3413,7 +2879,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if(opt_arch < 75) { + if (opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for 
SoCs archs lower than v75.\n"); } @@ -3422,11 +2888,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h index 1a48f5dcbd..7bbae3a0b7 100644 --- a/ggml/src/ggml-hexagon/htp-utils.h +++ b/ggml/src/ggml-hexagon/htp-utils.h @@ -8,6 +8,7 @@ extern "C" { #include #include #include +#include #include /* Offset to differentiate HLOS and Hexagon error codes. diff --git a/ggml/src/ggml-hexagon/op-desc.h b/ggml/src/ggml-hexagon/op-desc.h new file mode 100644 index 0000000000..a1e8ddd8b9 --- /dev/null +++ b/ggml/src/ggml-hexagon/op-desc.h @@ -0,0 +1,153 @@ +#ifndef OP_DESC_H +#define OP_DESC_H + +#define GGML_COMMON_IMPL_CPP +#include "ggml-backend-impl.h" +#include "ggml-common.h" + +#include +#include + +struct op_desc { + char strides[64 * GGML_MAX_SRC]; + char dims[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + int format_tensor_dims(char * str, const struct ggml_tensor * t) { + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); + } else { + return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + } + } + + void format_op_dims(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += format_tensor_dims(p, t->src[0]); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += format_tensor_dims(p, t->src[i]); + } + + p += sprintf(p, " -> "); + } + + // format self dims separately for better visual alignment + char self[64]; + format_tensor_dims(self, t); + + p += sprintf(p, "%s", self); + } + + int format_tensor_strides(char * str, const struct ggml_tensor * t) { + const char * c = ggml_is_contiguous(t) ? 
"" : "!"; + + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); + } else { + return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); + } + } + + void format_op_strides(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += format_tensor_strides(p, t->src[0]); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += format_tensor_strides(p, t->src[i]); + } + + p += sprintf(p, " -> "); + } + + // format self dims separately for better visual alignment + char self[64]; + format_tensor_strides(self, t); + + p += sprintf(p, "%s", self); + } + + void format_op_types(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", ggml_type_name(t->type)); + } + + const char * tensor_buff_name(const struct ggml_tensor * t) { + if (t->buffer) { + return ggml_backend_buffer_name(t->buffer); + } + return "NONE"; + } + + void format_op_buffs(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", tensor_buff_name(t->src[0])); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", tensor_buff_name(t->src[i])); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", tensor_buff_name(t)); + } + + void format_op_names(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", t->src[0]->name); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", t->src[i]->name); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", t->name); + } + + void format(const ggml_tensor * op) { + format_op_dims(dims, op); + format_op_strides(strides, op); + format_op_types(types, op); + format_op_buffs(buffs, op); + format_op_names(names, op); + } + + op_desc() {} + op_desc(const ggml_tensor * op) { format(op); } +}; + +#endif // OP_DESC_H diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 13cf1f5f9d..e7890a5ee9 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -571,6 +571,10 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->base_ptr; } +static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer; +} + static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { rpc_tensor result; if (!tensor) { @@ -580,7 +584,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { result.id = reinterpret_cast(tensor); result.type = tensor->type; - if (tensor->buffer) { + if (tensor->buffer && ggml_backend_buffer_is_rpc(tensor->buffer)) { ggml_backend_buffer_t buffer = tensor->buffer; ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; result.buffer = ctx != nullptr ? 
ctx->remote_ptr : 0; @@ -664,10 +668,6 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con RPC_STATUS_ASSERT(status); } -static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) { - return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer; -} - static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_rpc(src->buffer)) { // check if src and dst are on the same server diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cab8f2901a..41d3bd4faf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -181,6 +181,7 @@ class Keys: DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" + FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" @@ -354,6 +355,7 @@ class MODEL_ARCH(IntEnum): STARCODER = auto() REFACT = auto() BERT = auto() + MODERN_BERT = auto() NOMIC_BERT = auto() NOMIC_BERT_MOE = auto() NEO_BERT = auto() @@ -747,6 +749,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", + MODEL_ARCH.MODERN_BERT: "modern-bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.NEO_BERT: "neo-bert", @@ -1367,6 +1370,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.CLS, MODEL_TENSOR.CLS_OUT, ], + MODEL_ARCH.MODERN_BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, + ], MODEL_ARCH.NOMIC_BERT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9e6ff3ac77..6a4a504f8d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -774,8 +774,12 @@ class GGUFWriter: def add_shared_kv_layers(self, value: int) -> None: self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) - def add_sliding_window_pattern(self, value: Sequence[bool]) -> None: - self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value) + def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None: + key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch) + if isinstance(value, int): + self.add_uint32(key, value) + else: + self.add_array(key, value) def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None: self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f) @@ -886,6 +890,9 @@ class GGUFWriter: def add_value_residual_mix_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length) + def add_rope_freq_base_swa(self, value: float) -> None: + self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value) + def add_gate_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 301aafa910..276720fcde 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,6 +17,7 @@ class TensorNameMap: "embed_tokens", # embeddinggemma "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert + "embeddings.tok_embeddings", # modern-bert "language_model.embedding.word_embeddings", # persimmon "wte", # gpt2 "transformer.embd.wte", # phi2 @@ -46,6 +47,7 @@ class TensorNameMap: MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom "embeddings.LayerNorm", # bert + "embeddings.norm", # modern-bert "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv @@ -75,6 +77,7 @@ class TensorNameMap: "head.out", # wavtokenizer "lm_head", # llama4 "model.transformer.ff_out", # llada + "head.decoder", # modern-bert ), MODEL_TENSOR.DENSE_2_OUT: ( "dense_2_out", # embeddinggemma @@ -104,6 +107,7 @@ class TensorNameMap: "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 "model.transformer.ln_f", # llada + "final_norm", # modern-bert "model.norm", # cogvlm ), @@ -151,6 +155,7 @@ class TensorNameMap: "model.layers.{bid}.input_layernorm", # llama4 "layers.{bid}.input_layernorm", # embeddinggemma "transformer_encoder.{bid}.attention_norm", # neobert + "layers.{bid}.attn_norm", # modern-bert "model.layers.{bid}.operator_norm", # lfm2 "model.transformer.blocks.{bid}.attn_norm", # llada "layers.{bid}.input_layernorm", # qwen3-embedding @@ -187,6 +192,7 @@ class TensorNameMap: "encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer_encoder.{bid}.qkv", # neobert + "layers.{bid}.attn.Wqkv", # modern-bert "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm ), @@ -261,6 +267,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.linear_attn", # deci "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert + "layers.{bid}.attn.Wo", # modern-bert "transformer.layer.{bid}.attention.out_lin", # distillbert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon @@ -344,6 +351,7 @@ class TensorNameMap: "layers.{bid}.post_attention_layernorm", # qwen3-embedding "model.layers.{bid}.feedforward_layernorm", # apertus "model.layers.{bid}.pre_mlp_layernorm", # kormo + "layers.{bid}.mlp_norm" # modern-bert ), # Pre feed-forward norm @@ -407,6 +415,7 @@ class TensorNameMap: "layers.{bid}.mlp.up_proj", # embeddinggemma "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert + "layers.{bid}.mlp.Wi", # modern-bert "transformer.layer.{bid}.ffn.lin1", # distillbert "transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.linear_3", # refact @@ -521,6 +530,7 @@ class TensorNameMap: "layers.{bid}.mlp.down_proj", # embeddinggemma "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert + "layers.{bid}.mlp.Wo", # modern-bert "transformer.layer.{bid}.ffn.lin2", # distillbert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon @@ -1122,6 +1132,7 @@ class TensorNameMap: "classifier.dense", # roberta "pre_classifier", # distillbert "dense", # neobert + "head.dense", # modern-bert ), MODEL_TENSOR.CLS_OUT: ( diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index cc5e47c2d6..8a3053c859 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -18,17 +18,17 @@ 
model="Llama-3.2-3B-Instruct-Q4_0.gguf" device="HTP0" [ "$D" != "" ] && device="$D" -verbose= -[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" - experimental= [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" +verbose= +[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v" + sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" opmask= [ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" @@ -45,9 +45,9 @@ adb $adbserial shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev \ - ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ - --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ - -ngl 99 --device $device $cli_opts $@ \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev \ + ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ + --ctx-size 8192 --batch-size 128 -fa on \ + -ngl 99 --device $device $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh new file mode 100755 index 0000000000..bb7ba5e671 --- /dev/null +++ b/scripts/snapdragon/adb/run-completion.sh @@ -0,0 +1,53 @@ +#!/bin/sh +# + +# Basedir on device +basedir=/data/local/tmp/llama.cpp + +cli_opts= + +branch=. +[ "$B" != "" ] && branch=$B + +adbserial= +[ "$S" != "" ] && adbserial="-s $S" + +model="Llama-3.2-3B-Instruct-Q4_0.gguf" +[ "$M" != "" ] && model="$M" + +device="HTP0" +[ "$D" != "" ] && device="$D" + +experimental= +[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" + +verbose= +[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v" + +sched= +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" + +profile= +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" + +opmask= +[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" + +nhvx= +[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" + +ndev= +[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" + +set -x + +adb $adbserial shell " \ + cd $basedir; ulimit -c unlimited; \ + LD_LIBRARY_PATH=$basedir/$branch/lib \ + ADSP_LIBRARY_PATH=$basedir/$branch/lib \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev \ + ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ + --ctx-size 8192 --batch-size 128 -fa on \ + -ngl 99 -no-cnv --device $device $cli_opts $@ \ +" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4192af7c0c..4ca8974916 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -90,6 +90,7 @@ add_library(llama models/mamba.cpp models/minicpm3.cpp models/minimax-m2.cpp + models/modern-bert.cpp models/mpt.cpp models/nemotron-h.cpp models/nemotron.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index d0eaf317f7..80f44ae1bf 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -20,6 +20,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_MODERN_BERT, 
"modern-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_NEO_BERT, "neo-bert" }, @@ -204,6 +205,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, @@ -214,6 +216,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, @@ -778,6 +781,20 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, }; + case LLM_ARCH_MODERN_BERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_CLS, + LLM_TENSOR_CLS_OUT, + }; case LLM_ARCH_JINA_BERT_V2: return { LLM_TENSOR_TOKEN_EMBD, diff --git a/src/llama-arch.h b/src/llama-arch.h index 6cbf9b1f89..a53bc39d18 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -24,6 +24,7 @@ enum llm_arch { LLM_ARCH_STARCODER, LLM_ARCH_REFACT, LLM_ARCH_BERT, + LLM_ARCH_MODERN_BERT, LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_NEO_BERT, @@ -208,6 +209,7 @@ enum llm_kv { LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, + LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_TEMPERATURE_LENGTH, @@ -218,6 +220,7 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_FREQ_BASE_SWA, LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 33a76dba40..5003b4fbf5 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -462,6 +462,29 @@ namespace GGUFMeta { return get_key_or_arr(llm_kv(kid), result, n, required); } + bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { + const std::string key = llm_kv(kid); + + const int id = gguf_find_key(meta.get(), key.c_str()); + + if (id < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + // throw and error if type is an array + if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) { + if (required) { + throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); + } + return false; + } + + return get_key(key, result, required); + } + // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool 
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 0380c92fde..d13299ad3f 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h
@@ -131,6 +131,8 @@ struct llama_model_loader {
 template <typename T> bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
+ bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
 std::string get_arch_name() const;
 enum llm_arch get_arch() const;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d2270e8f2d..0d5bcc64fe 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_17M: return "17M";
 case LLM_TYPE_22M: return "22M";
 case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_47M: return "47M";
 case LLM_TYPE_60M: return "60M";
 case LLM_TYPE_70M: return "70M";
 case LLM_TYPE_80M: return "80M";
 case LLM_TYPE_109M: return "109M";
 case LLM_TYPE_137M: return "137M";
 case LLM_TYPE_140M: return "140M";
+ case LLM_TYPE_149M: return "149M";
 case LLM_TYPE_160M: return "160M";
 case LLM_TYPE_190M: return "190M";
 case LLM_TYPE_220M: return "220M";
@@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_335M: return "335M";
 case LLM_TYPE_350M: return "350M";
 case LLM_TYPE_360M: return "360M";
+ case LLM_TYPE_395M: return "395M";
 case LLM_TYPE_410M: return "410M";
 case LLM_TYPE_450M: return "450M";
 case LLM_TYPE_475M: return "475M";
@@ -875,6 +878,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 3;
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ type = LLM_TYPE_47M; break; // granite-embedding-small
+ case 22:
+ type = LLM_TYPE_149M; break; // modern-bert-base
+ case 28:
+ type = LLM_TYPE_395M; break; // modern-bert-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
 case LLM_ARCH_JINA_BERT_V2:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3155,6 +3186,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
 }
 } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ if (i != 0) {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ } else {
+ // layer 0 uses identity
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ } break;
 case LLM_ARCH_NEO_BERT:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5181,9 +5243,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 const int64_t n_group = hparams.ssm_n_group;
 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
 // embeddings
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5235,6 +5294,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 } else {
 if (n_expert != 0) {
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
@@ -7089,6 +7151,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 case LLM_ARCH_NOMIC_BERT_MOE:
 case LLM_ARCH_NEO_BERT:
 case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_MODERN_BERT:
 case LLM_ARCH_GEMMA_EMBEDDING:
 case LLM_ARCH_DREAM:
 case LLM_ARCH_LLADA:
@@ -7248,6 +7311,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 {
 llm = std::make_unique(*this, params);
 } break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
+ } break;
 case LLM_ARCH_NEO_BERT:
 {
 llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7816,6 +7883,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_DBRX:
 case LLM_ARCH_BERT:
 case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_MODERN_BERT:
 case LLM_ARCH_NOMIC_BERT:
 case LLM_ARCH_NOMIC_BERT_MOE:
 case LLM_ARCH_STABLELM:
diff --git a/src/llama-model.h b/src/llama-model.h index c6eb953188..7f560d462f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h
@@ -24,12 +24,14 @@ enum llm_type {
 LLM_TYPE_17M,
 LLM_TYPE_22M,
 LLM_TYPE_33M,
+ LLM_TYPE_47M,
 LLM_TYPE_60M,
 LLM_TYPE_70M,
 LLM_TYPE_80M,
 LLM_TYPE_109M,
 LLM_TYPE_137M,
 LLM_TYPE_140M,
+ LLM_TYPE_149M,
 LLM_TYPE_160M,
 LLM_TYPE_190M,
 LLM_TYPE_220M,
@@ -39,6 +41,7 @@ enum llm_type {
 LLM_TYPE_335M,
 LLM_TYPE_350M,
 LLM_TYPE_360M,
+ LLM_TYPE_395M,
 LLM_TYPE_410M,
 LLM_TYPE_450M,
 LLM_TYPE_475M,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 7b01a2edfe..cd4092ca07 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1878,7 +1878,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 tokenizer_pre == "jina-v2-es" ||
 tokenizer_pre == "jina-v2-de" ||
 tokenizer_pre == "a.x-4.0" ||
"a.x-4.0" || - tokenizer_pre == "mellum") { + tokenizer_pre == "mellum" || + tokenizer_pre == "modern-bert" ) { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "jina-v1-en" || @@ -2528,6 +2529,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { for (const auto * token : {"", "", "<|endoftext|>"}) { _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); } + } else if (_contains_any(model_name, {"modern-bert"})) { + if (token_to_id.count("[MASK]") == 0 ) { + LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__); + } + else { + _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true); + } } } } diff --git a/src/models/models.h b/src/models/models.h index ffb36acc61..53a5810659 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -327,6 +327,11 @@ struct llm_build_mistral3 : public llm_graph_context { llm_build_mistral3(const llama_model & model, const llm_graph_params & params); }; +template +struct llm_build_modern_bert : public llm_graph_context { + llm_build_modern_bert(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_mpt : public llm_graph_context { llm_build_mpt(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp new file mode 100644 index 0000000000..c7809bdedf --- /dev/null +++ b/src/models/modern-bert.cpp @@ -0,0 +1,126 @@ +#include "models.h" + +template +llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = build_inp_pos(); + + // construct input embeddings (token, type, position) + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = build_attn_inp_no_cache(); + + for (int il = 0; il < n_layer; ++il) { + float freq_base_l = 0.0f; + + if constexpr (iswa) { + freq_base_l = model.get_rope_freq_base(cparams, il); + } else { + freq_base_l = freq_base; + } + + cur = inpL; + + // attention layer norm + if (model.layers[il].attn_norm) { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + } + + // self attention + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + const size_t type_size = ggml_type_size(cur->type); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa)); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", 
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // re-add the layer input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FFN layer norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
+
+ // residual: the attention output bypasses the FFN block
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM, -1);
+ cb(cur, "final_norm_out", -1);
+
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+ // extract the CLS token
+ cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
+ cb(cur, "cls_pooled_embd", -1);
+ }
+
+ cb(cur, "res_embd", -1);
+ res->t_embd = cur;
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_modern_bert<false>;
+template struct llm_build_modern_bert<true>;
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index b5266edee7..cf5c625b40 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index a132b87c84..94825dc862 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp
@@ -2313,6 +2313,12 @@ private:
 slot.n_prompt_tokens_processed = 0;
 slot.prompt.tokens.keep_first(n_past);
+
+ // send initial 0% progress update if needed
+ // this is to signal the client that the request has started processing
+ if (slot.task->params.stream && slot.task->params.return_progress) {
+ send_partial_response(slot, {}, true);
+ }
 }
 if (!slot.can_split()) {
@@ -2784,6 +2790,12 @@ server_response_reader server_context::get_response_reader() {
 server_context_meta server_context::get_meta() const {
 auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
+
+ auto bos_id = llama_vocab_bos(impl->vocab);
+ auto eos_id = llama_vocab_eos(impl->vocab);
+ auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
+ auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, eos_id, true) : "";
+
 return server_context_meta {
 /* build_info */ build_info,
 /* model_name */ impl->model_name,
@@ -2798,8 +2810,8 @@ server_context_meta server_context::get_meta() const {
 /* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
 /* chat_template_tool_use */ tool_use_src ?
tool_use_src : "", - /* bos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_bos(impl->vocab), true), - /* eos_token_str */ common_token_to_piece(impl->ctx, llama_vocab_eos(impl->vocab), true), + /* bos_token_str */ bos_token_str, + /* eos_token_str */ eos_token_str, /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab), /* fim_sub_token */ llama_vocab_fim_suf(impl->vocab), /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab), diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 64f3158b98..5f5de415cf 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -434,8 +434,8 @@ def test_context_size_exceeded_stream(): @pytest.mark.parametrize( "n_batch,batch_count,reuse_cache", [ - (64, 3, False), - (64, 1, True), + (64, 4, False), + (64, 2, True), ] ) def test_return_progress(n_batch, batch_count, reuse_cache): @@ -462,10 +462,18 @@ def test_return_progress(n_batch, batch_count, reuse_cache): res = make_cmpl_request() last_progress = None total_batch_count = 0 + for data in res: cur_progress = data.get("prompt_progress", None) if cur_progress is None: continue + if total_batch_count == 0: + # first progress report must have n_cache == n_processed + assert cur_progress["total"] > 0 + assert cur_progress["cache"] == cur_progress["processed"] + if reuse_cache: + # when reusing cache, we expect some cached tokens + assert cur_progress["cache"] > 0 if last_progress is not None: assert cur_progress["total"] == last_progress["total"] assert cur_progress["cache"] == last_progress["cache"] @@ -473,6 +481,7 @@ def test_return_progress(n_batch, batch_count, reuse_cache): total_batch_count += 1 last_progress = cur_progress + # last progress should indicate completion (all tokens processed) assert last_progress is not None assert last_progress["total"] > 0 assert last_progress["processed"] == last_progress["total"] diff --git a/tools/server/webui/src/lib/stores/settings.svelte.ts b/tools/server/webui/src/lib/stores/settings.svelte.ts index e163833bfb..cda940ba7e 100644 --- a/tools/server/webui/src/lib/stores/settings.svelte.ts +++ b/tools/server/webui/src/lib/stores/settings.svelte.ts @@ -294,15 +294,14 @@ class SettingsStore { * This sets up the default values from /props endpoint */ syncWithServerDefaults(): void { - const serverParams = serverStore.defaultParams; - if (!serverParams) { - console.warn('No server parameters available for initialization'); + const propsDefaults = this.getServerDefaults(); + + if (Object.keys(propsDefaults).length === 0) { + console.warn('No server defaults available for initialization'); return; } - const propsDefaults = this.getServerDefaults(); - for (const [key, propsValue] of Object.entries(propsDefaults)) { const currentValue = getConfigValue(this.config, key); diff --git a/tools/server/webui/src/routes/+layout.svelte b/tools/server/webui/src/routes/+layout.svelte index a14dfb633c..095827b9ca 100644 --- a/tools/server/webui/src/routes/+layout.svelte +++ b/tools/server/webui/src/routes/+layout.svelte @@ -119,7 +119,7 @@ $effect(() => { const serverProps = serverStore.props; - if (serverProps?.default_generation_settings?.params) { + if (serverProps) { settingsStore.syncWithServerDefaults(); } });