diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 14936f8e9c..830fe19e3e 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -1,8 +1,8 @@ ARG UBUNTU_VERSION=24.04 # This needs to generally match the container host's environment. -ARG ROCM_VERSION=7.0 -ARG AMDGPU_VERSION=7.0 +ARG ROCM_VERSION=7.2 +ARG AMDGPU_VERSION=7.2 # Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete @@ -11,13 +11,12 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. -# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported -# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html +# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html +# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html +# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html -ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151' -#ARG ROCM_DOCKER_ARCH='gfx1151' +ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201' # Set ROCm architectures ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1914c08489..860acc6b1b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -516,6 +516,102 @@ jobs: path: llama-bin-win-sycl-x64.zip name: 
llama-bin-win-sycl-x64.zip + ubuntu-22-rocm: + runs-on: ubuntu-22.04 + + strategy: + matrix: + include: + - ROCM_VERSION: "7.2" + gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201" + build: 'x64' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + sudo apt install -y build-essential git cmake wget + + - name: Setup Legacy ROCm + if: matrix.ROCM_VERSION == '7.2' + id: legacy_env + run: | + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ + gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + + sudo tee /etc/apt/sources.list.d/rocm.list << EOF + deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main + EOF + + sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF + Package: * + Pin: release o=repo.radeon.com + Pin-Priority: 600 + EOF + + sudo apt update + sudo apt-get install -y libssl-dev rocm-hip-sdk + + - name: Setup TheRock + if: matrix.ROCM_VERSION != '7.2' + id: therock_env + run: | + wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz + mkdir install + tar -xf *.tar.gz -C install + export ROCM_PATH=$(pwd)/install + echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV + echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV + echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV + + - name: Build with native CMake HIP support + id: cmake_build + run: | + cmake -B build -S . 
\ + -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DGPU_TARGETS="${{ matrix.gpu_targets }}" \ + -DGGML_HIP=ON \ + -DHIP_PLATFORM=amd \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz + name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz + windows-hip: runs-on: windows-2022 @@ -784,6 +880,7 @@ jobs: - windows-cuda - windows-sycl - windows-hip + - ubuntu-22-rocm - ubuntu-22-cpu - ubuntu-22-vulkan - macOS-arm64 @@ -868,6 +965,7 @@ jobs: **Linux:** - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz) - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz) + - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name 
}}-bin-ubuntu-s390x.tar.gz) **Windows:** diff --git a/CMakeLists.txt b/CMakeLists.txt index 32542ecd27..69da97dc1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. +cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories. project("llama.cpp" C CXX) include(CheckIncludeFileCXX) diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index a80900ff8d..ba359fdbf4 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons } // remove potential partial suffix - if (builder.pos() == builder.input().size()) { + if (builder.pos() == builder.input().size() && builder.is_partial()) { if (unclosed_reasoning_content.empty()) { rstrip(content); trim_potential_partial_word(content); diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index 29819e48d3..060578f0b7 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -893,23 +893,6 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) { builder.consume_reasoning_with_xml_tool_calls(form, "", ""); } -static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) { - static const xml_tool_call_format form = ([]() { - xml_tool_call_format form {}; - form.scope_start = ""; - form.tool_start = "") != std::string::npos); + // Handle thinking tags appropriately based on inputs.enable_thinking - if (string_ends_with(data.prompt, "\n")) { + if (supports_reasoning && string_ends_with(data.prompt, "\n")) { if (!inputs.enable_thinking) { data.prompt += ""; } else { @@ -1538,19 +1540,21 @@ static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_ } data.preserved_tokens = { - "", - "", "", "", }; + if (supports_reasoning) { + 
data.preserved_tokens.insert(data.preserved_tokens.end(), {"", ""}); + } + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; auto include_grammar = true; auto parser = build_chat_peg_constructed_parser([&](auto & p) { auto reasoning = p.eps(); - if (inputs.enable_thinking && extract_reasoning) { + if (supports_reasoning && inputs.enable_thinking && extract_reasoning) { auto reasoning_content = p.reasoning(p.until("")) + ("" | p.end()); if (data.thinking_forced_open) { reasoning = reasoning_content; @@ -1888,38 +1892,6 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t return data; } -static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) { - common_chat_params data; - data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; - - data.prompt = apply(tmpl, params); - data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML; - - data.preserved_tokens = { - "", - "", - "", - "", - }; - - // build grammar for tool call - static const xml_tool_call_format form { - /* form.scope_start = */ "\n", - /* form.tool_start = */ "\n", - /* form.key_start = */ "\n", - /* form.val_end = */ "\n\n", - /* form.tool_end = */ "\n", - /* form.scope_end = */ "", - }; - build_grammar_xml_tool_call(data, params.tools, form); - - return data; -} - static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) { common_chat_params data; data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; @@ -3147,13 +3119,7 @@ static common_chat_params common_chat_templates_apply_jinja( src.find("") != std::string::npos) { - return common_chat_params_init_nemotron_v3(tmpl, params); - } - 
return common_chat_params_init_qwen3_coder_xml(tmpl, params); + return common_chat_params_init_qwen3_coder(tmpl, params); } // Xiaomi MiMo format detection (must come before Hermes 2 Pro) diff --git a/common/chat.h b/common/chat.h index 1bf43f7261..6f0b9409ec 100644 --- a/common/chat.h +++ b/common/chat.h @@ -128,7 +128,6 @@ enum common_chat_format { COMMON_CHAT_FORMAT_GLM_4_5, COMMON_CHAT_FORMAT_MINIMAX_M2, COMMON_CHAT_FORMAT_KIMI_K2, - COMMON_CHAT_FORMAT_QWEN3_CODER_XML, COMMON_CHAT_FORMAT_APRIEL_1_5, COMMON_CHAT_FORMAT_XIAOMI_MIMO, COMMON_CHAT_FORMAT_SOLAR_OPEN, diff --git a/common/common.cpp b/common/common.cpp index 75116ed6f3..53bddc4ef2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1760,3 +1760,65 @@ float lr_opt::get_lr(float epoch) const { LOG_INF("epoch %.2g lr=%.2g\n", epoch, r); return r; } + +bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) { + llama_batch batch = llama_batch_get_one(&last_token, 1); + batch.pos = &pos; + if (llama_decode(ctx, batch)) { + LOG_ERR("%s: failed to replay last token\n", __func__); + return false; + } + return true; +} + +bool common_prompt_batch_decode( + struct llama_context * ctx, + const std::vector & tokens, + int & n_past, + int n_batch, + std::string_view state_path, + bool save_state) { + const int n_eval = tokens.size(); + if (n_eval == 0) { + return true; + } + + if (save_state && n_eval > 1) { + const int n_tokens_before_last = n_eval - 1; + + GGML_ASSERT(n_eval <= n_batch); + + // Decode all but the last token so we can save the memory state before decoding the last token. + // This is done so we can restore the session state later and replay the last token. + // Memory implementations in recurrent/hybrid models don't support removing tokens from their + // memory, so we can't just remove the last token from the memory and replay the last token which + // is the reason for this logic. 
+ if (llama_decode(ctx, llama_batch_get_one(const_cast(tokens.data()), n_tokens_before_last))) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + n_past += n_tokens_before_last; + + llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last); + LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last); + + llama_token last_token = tokens.back(); + llama_batch batch = llama_batch_get_one(&last_token, 1); + int32_t pos = n_past; + batch.pos = &pos; + + if (llama_decode(ctx, batch)) { + LOG_ERR("%s : failed to eval last token\n", __func__); + return false; + } + n_past++; + } else { + if (llama_decode(ctx, llama_batch_get_one(const_cast(tokens.data()), n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + + return true; +} diff --git a/common/common.h b/common/common.h index a4c431172d..1fa1728656 100644 --- a/common/common.h +++ b/common/common.h @@ -804,6 +804,23 @@ void common_batch_add( const std::vector & seq_ids, bool logits); +// decodes a single batch of tokens for a prompt and manages session tokens +// +// Note: We save state before the last token so that we can replay it to ensure +// compatibility with all memory types. Recurrent/hybrid models cannot remove +// tokens from memory, so this approach works across all model architectures. 
+bool common_prompt_batch_decode( + struct llama_context * ctx, + const std::vector & embd, + int & n_past, + int n_batch, + std::string_view state_path, + bool save_state); + +// replays the last token after loading state to regenerate logits +// used after loading session state to ensure the sampling context has valid logits +bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos); + // // Vocab utils // diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index cc012c892f..c93e182a7e 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) { auto builtins = global_builtins(); if (!it->is_undefined()) { if (ctx.is_get_stats) { - it->stats.used = true; + value_t::stats_t::mark_used(it); } JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str()); return it; @@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) { static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) { JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str()); if (ctx.is_get_stats) { - input->stats.used = true; + value_t::stats_t::mark_used(input); input->stats.ops.insert(name); } auto builtins = input->get_builtins(); @@ -448,7 +448,7 @@ value for_statement::execute_impl(context & ctx) { // mark the variable being iterated as used for stats if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); iterable_val->stats.ops.insert("array_access"); } @@ -470,7 +470,7 @@ value for_statement::execute_impl(context & ctx) { items.push_back(std::move(tuple)); } if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); iterable_val->stats.ops.insert("object_access"); } } else { @@ -480,7 +480,7 @@ value for_statement::execute_impl(context & ctx) { 
items.push_back(item); } if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); iterable_val->stats.ops.insert("array_access"); } } @@ -817,8 +817,9 @@ value member_expression::execute_impl(context & ctx) { } if (ctx.is_get_stats && val && object && property) { - val->stats.used = true; - object->stats.used = true; + value_t::stats_t::mark_used(val); + value_t::stats_t::mark_used(object); + value_t::stats_t::mark_used(property); if (is_val(property)) { object->stats.ops.insert("array_access"); } else if (is_val(property)) { diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 9987836d18..749113124b 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -161,6 +161,11 @@ static value tojson(const func_args & args) { value val_separators = args.get_kwarg_or_pos("separators", 3); value val_sort = args.get_kwarg_or_pos("sort_keys", 4); int indent = -1; + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } if (is_val(val_indent)) { indent = static_cast(val_indent->as_int()); } @@ -891,6 +896,11 @@ const func_builtins & value_array_t::get_builtins() const { }}, {"string", [](const func_args & args) -> value { args.ensure_vals(); + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } return mk_val(args.get_pos(0)->as_string()); }}, {"tojson", tojson}, @@ -1046,6 +1056,11 @@ const func_builtins & value_object_t::get_builtins() const { {"tojson", tojson}, {"string", [](const func_args & args) -> value { args.ensure_vals(); + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } return mk_val(args.get_pos(0)->as_string()); }}, {"length", [](const 
func_args & args) -> value { @@ -1358,4 +1373,21 @@ std::string value_to_string_repr(const value & val) { } } +// stats utility +void value_t::stats_t::mark_used(value & val, bool deep) { + val->stats.used = true; + if (deep) { + if (is_val(val)) { + for (auto & item : val->val_arr) { + mark_used(item, deep); + } + } else if (is_val(val)) { + for (auto & pair : val->val_obj) { + mark_used(pair.first, deep); + mark_used(pair.second, deep); + } + } + } +} + } // namespace jinja diff --git a/common/jinja/value.h b/common/jinja/value.h index 1c04760a08..07e447ff69 100644 --- a/common/jinja/value.h +++ b/common/jinja/value.h @@ -118,6 +118,8 @@ struct value_t { bool used = false; // ops can be builtin calls or operators: "array_access", "object_access" std::set ops; + // utility to recursively mark value and its children as used + static void mark_used(value & val, bool deep = false); } stats; value_t() = default; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 31acd5bb48..e038109599 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1274,6 +1274,9 @@ class TextModel(ModelBase): if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash res = "joyai-llm" + if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": + # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 + res = "kanana2" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 8f7443d1b5..53a73759ec 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -152,6 +152,7 @@ models = [ {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", }, {"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }, {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": 
"https://huggingface.co/jdopensource/JoyAI-LLM-Flash", }, + {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/examples/model-conversion/Makefile b/examples/model-conversion/Makefile index 342de63bd0..9356aaf854 100644 --- a/examples/model-conversion/Makefile +++ b/examples/model-conversion/Makefile @@ -77,7 +77,10 @@ causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-em @./scripts/causal/compare-embeddings-logits.sh causal-inspect-original-model: - @./scripts/utils/inspect-org-model.py + @./scripts/utils/inspect-org-model.py --list-all -s + +causal-list-original-model-tensors: + @./scripts/utils/inspect-org-model.py --list-all-short -s causal-inspect-converted-model: @./scripts/utils/inspect-converted-model.sh @@ -153,7 +156,7 @@ embedding-verify-logits-st: embedding-run-original-model-st embedding-run-conver embedding-inspect-original-model: $(call validate_embedding_model_path,embedding-inspect-original-model) - @EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} + @EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} --list-all -s embedding-inspect-converted-model: @CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL} diff --git a/examples/model-conversion/scripts/utils/inspect-org-model.py b/examples/model-conversion/scripts/utils/inspect-org-model.py index bc6f45a5fb..5c3674af71 100755 --- a/examples/model-conversion/scripts/utils/inspect-org-model.py +++ b/examples/model-conversion/scripts/utils/inspect-org-model.py @@ -1,67 +1,290 @@ #!/usr/bin/env python3 import argparse -import os import json +import os +import re +import struct +import sys +from pathlib import Path 
+from typing import Optional from safetensors import safe_open -from collections import defaultdict -parser = argparse.ArgumentParser(description='Process model with specified path') -parser.add_argument('--model-path', '-m', help='Path to the model') -args = parser.parse_args() -model_path = os.environ.get('MODEL_PATH', args.model_path) -if model_path is None: - parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") +MODEL_SAFETENSORS_FILE = "model.safetensors" +MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json" -# Check if there's an index file (multi-file model) -index_path = os.path.join(model_path, "model.safetensors.index.json") -single_file_path = os.path.join(model_path, "model.safetensors") +DTYPE_SIZES = { + "F64": 8, "I64": 8, "U64": 8, + "F32": 4, "I32": 4, "U32": 4, + "F16": 2, "BF16": 2, "I16": 2, "U16": 2, + "I8": 1, "U8": 1, "BOOL": 1, + "F8_E4M3": 1, "F8_E5M2": 1, +} -if os.path.exists(index_path): - # Multi-file model - print("Multi-file model detected") +SIZE_UNITS = ['B', 'KB', 'MB', 'GB', 'TB'] - with open(index_path, 'r') as f: - index_data = json.load(f) - # Get the weight map (tensor_name -> file_name) - weight_map = index_data.get("weight_map", {}) +def get_weight_map(model_path: Path) -> Optional[dict[str, str]]: + index_file = model_path / MODEL_SAFETENSORS_INDEX - # Group tensors by file for efficient processing - file_tensors = defaultdict(list) - for tensor_name, file_name in weight_map.items(): - file_tensors[file_name].append(tensor_name) + if index_file.exists(): + with open(index_file, 'r') as f: + index = json.load(f) + return index.get("weight_map", {}) - print("Tensors in model:") + return None - # Process each shard file - for file_name, tensor_names in file_tensors.items(): - file_path = os.path.join(model_path, file_name) - print(f"\n--- From {file_name} ---") - with safe_open(file_path, framework="pt") as f: - for tensor_name in sorted(tensor_names): - tensor = 
f.get_tensor(tensor_name) - print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}") +def get_all_tensor_names(model_path: Path) -> list[str]: + weight_map = get_weight_map(model_path) -elif os.path.exists(single_file_path): - # Single file model (original behavior) - print("Single-file model detected") + if weight_map is not None: + return list(weight_map.keys()) - with safe_open(single_file_path, framework="pt") as f: - keys = f.keys() - print("Tensors in model:") - for key in sorted(keys): - tensor = f.get_tensor(key) - print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}") + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + try: + with safe_open(single_file, framework="pt", device="cpu") as f: + return list(f.keys()) + except Exception as e: + print(f"Error reading {single_file}: {e}") + sys.exit(1) -else: - print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}") - print("Available files:") - if os.path.exists(model_path): - for item in sorted(os.listdir(model_path)): - print(f" {item}") + print(f"Error: No safetensors files found in {model_path}") + sys.exit(1) + + +def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + return weight_map.get(tensor_name) + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + return single_file.name + + return None + + +def read_safetensors_header(file_path: Path) -> dict: + with open(file_path, 'rb') as f: + header_size = struct.unpack(' int: + offsets = tensor_meta.get("data_offsets") + if offsets and len(offsets) == 2: + return offsets[1] - offsets[0] + n_elements = 1 + for d in tensor_meta.get("shape", []): + n_elements *= d + return n_elements * DTYPE_SIZES.get(tensor_meta.get("dtype", "F32"), 4) + + +def format_size(size_bytes: int) -> str: + val = float(size_bytes) + for unit in SIZE_UNITS[:-1]: + 
if val < 1024.0: + return f"{val:.2f} {unit}" + val /= 1024.0 + return f"{val:.2f} {SIZE_UNITS[-1]}" + + +def get_all_tensor_metadata(model_path: Path) -> dict[str, dict]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + file_to_tensors: dict[str, list[str]] = {} + for tensor_name, file_name in weight_map.items(): + file_to_tensors.setdefault(file_name, []).append(tensor_name) + + all_metadata: dict[str, dict] = {} + for file_name, tensor_names in file_to_tensors.items(): + try: + header = read_safetensors_header(model_path / file_name) + for tensor_name in tensor_names: + if tensor_name in header: + all_metadata[tensor_name] = header[tensor_name] + except Exception as e: + print(f"Warning: Could not read header from {file_name}: {e}", file=sys.stderr) + return all_metadata + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + try: + header = read_safetensors_header(single_file) + return {k: v for k, v in header.items() if k != "__metadata__"} + except Exception as e: + print(f"Error reading {single_file}: {e}") + sys.exit(1) + + print(f"Error: No safetensors files found in {model_path}") + sys.exit(1) + + +def normalize_tensor_name(tensor_name: str) -> str: + normalized = re.sub(r'\.\d+\.', '.#.', tensor_name) + normalized = re.sub(r'\.\d+$', '.#', normalized) + return normalized + + +def list_all_tensors( + model_path: Path, + short: bool = False, + show_sizes: bool = False, +): + tensor_names = get_all_tensor_names(model_path) + + metadata: Optional[dict[str, dict]] = None + if show_sizes: + metadata = get_all_tensor_metadata(model_path) + + total_bytes = 0 + + if short: + seen: dict[str, str] = {} + for tensor_name in sorted(tensor_names): + normalized = normalize_tensor_name(tensor_name) + if normalized not in seen: + seen[normalized] = tensor_name + display_pairs = list(sorted(seen.items())) + name_width = max((len(n) for n, _ in display_pairs), default=0) + for normalized, first_name in display_pairs: + 
if metadata and first_name in metadata: + m = metadata[first_name] + size = get_tensor_size_bytes(m) + total_bytes += size + print(f"{normalized:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}") + else: + print(normalized) else: - print(f" Directory {model_path} does not exist") - exit(1) + name_width = max((len(n) for n in tensor_names), default=0) + for tensor_name in sorted(tensor_names): + if metadata and tensor_name in metadata: + m = metadata[tensor_name] + size = get_tensor_size_bytes(m) + total_bytes += size + print(f"{tensor_name:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}") + else: + print(tensor_name) + + if show_sizes: + print(f"\nTotal: {format_size(total_bytes)}") + + +def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None): + tensor_file = find_tensor_file(model_path, tensor_name) + + if tensor_file is None: + print(f"Error: Could not find tensor '{tensor_name}' in model index") + print(f"Model path: {model_path}") + sys.exit(1) + + file_path = model_path / tensor_file + + try: + header = read_safetensors_header(file_path) + tensor_meta = header.get(tensor_name, {}) + dtype_str = tensor_meta.get("dtype") + + with safe_open(file_path, framework="pt", device="cpu") as f: + if tensor_name in f.keys(): + tensor_slice = f.get_slice(tensor_name) + shape = tensor_slice.get_shape() + print(f"Tensor: {tensor_name}") + print(f"File: {tensor_file}") + print(f"Shape: {shape}") + if dtype_str: + print(f"Dtype: {dtype_str}") + if tensor_meta: + print(f"Size: {format_size(get_tensor_size_bytes(tensor_meta))}") + if num_values is not None: + tensor = f.get_tensor(tensor_name) + if not dtype_str: + print(f"Dtype: {tensor.dtype}") + flat = tensor.flatten() + n = min(num_values, flat.numel()) + print(f"Values: {flat[:n].tolist()}") + else: + print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}") + sys.exit(1) + + except 
FileNotFoundError: + print(f"Error: The file '{file_path}' was not found.") + sys.exit(1) + except Exception as e: + print(f"An error occurred: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Print tensor information from a safetensors model" + ) + parser.add_argument( + "tensor_name", + nargs="?", + help="Name of the tensor to inspect" + ) + parser.add_argument( + "-m", "--model-path", + type=Path, + help="Path to the model directory (default: MODEL_PATH environment variable)" + ) + parser.add_argument( + "-l", "--list-all-short", + action="store_true", + help="List unique tensor patterns (layer numbers replaced with #)" + ) + parser.add_argument( + "-la", "--list-all", + action="store_true", + help="List all tensor names with actual layer numbers" + ) + parser.add_argument( + "-n", "--num-values", + nargs="?", + const=10, + default=None, + type=int, + metavar="N", + help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)" + ) + parser.add_argument( + "-s", "--sizes", + action="store_true", + help="Show dtype, shape, and size for each tensor when listing" + ) + + args = parser.parse_args() + + model_path = args.model_path + if model_path is None: + model_path_str = os.environ.get("MODEL_PATH") + if model_path_str is None: + print("Error: --model-path not provided and MODEL_PATH environment variable not set") + sys.exit(1) + model_path = Path(model_path_str) + + if not model_path.exists(): + print(f"Error: Model path does not exist: {model_path}") + sys.exit(1) + + if not model_path.is_dir(): + print(f"Error: Model path is not a directory: {model_path}") + sys.exit(1) + + if args.list_all_short or args.list_all: + list_all_tensors(model_path, short=args.list_all_short, show_sizes=args.sizes) + else: + if args.tensor_name is None: + print("Error: tensor_name is required when not using --list-all-short or --list-all") + sys.exit(1) + print_tensor_info(model_path, args.tensor_name, 
args.num_values) + + +if __name__ == "__main__": + main() diff --git a/examples/model-conversion/scripts/utils/tensor-info.py b/examples/model-conversion/scripts/utils/tensor-info.py deleted file mode 100755 index 1bb9e0564c..0000000000 --- a/examples/model-conversion/scripts/utils/tensor-info.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import os -import re -import sys -from pathlib import Path -from typing import Optional -from safetensors import safe_open - - -MODEL_SAFETENSORS_FILE = "model.safetensors" -MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json" - - -def get_weight_map(model_path: Path) -> Optional[dict[str, str]]: - index_file = model_path / MODEL_SAFETENSORS_INDEX - - if index_file.exists(): - with open(index_file, 'r') as f: - index = json.load(f) - return index.get("weight_map", {}) - - return None - - -def get_all_tensor_names(model_path: Path) -> list[str]: - weight_map = get_weight_map(model_path) - - if weight_map is not None: - return list(weight_map.keys()) - - single_file = model_path / MODEL_SAFETENSORS_FILE - if single_file.exists(): - try: - with safe_open(single_file, framework="pt", device="cpu") as f: - return list(f.keys()) - except Exception as e: - print(f"Error reading {single_file}: {e}") - sys.exit(1) - - print(f"Error: No safetensors files found in {model_path}") - sys.exit(1) - - -def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]: - weight_map = get_weight_map(model_path) - - if weight_map is not None: - return weight_map.get(tensor_name) - - single_file = model_path / MODEL_SAFETENSORS_FILE - if single_file.exists(): - return single_file.name - - return None - - -def normalize_tensor_name(tensor_name: str) -> str: - normalized = re.sub(r'\.\d+\.', '.#.', tensor_name) - normalized = re.sub(r'\.\d+$', '.#', normalized) - return normalized - - -def list_all_tensors(model_path: Path, unique: bool = False): - tensor_names = get_all_tensor_names(model_path) 
- - if unique: - seen = set() - for tensor_name in sorted(tensor_names): - normalized = normalize_tensor_name(tensor_name) - if normalized not in seen: - seen.add(normalized) - print(normalized) - else: - for tensor_name in sorted(tensor_names): - print(tensor_name) - - -def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None): - tensor_file = find_tensor_file(model_path, tensor_name) - - if tensor_file is None: - print(f"Error: Could not find tensor '{tensor_name}' in model index") - print(f"Model path: {model_path}") - sys.exit(1) - - file_path = model_path / tensor_file - - try: - with safe_open(file_path, framework="pt", device="cpu") as f: - if tensor_name in f.keys(): - tensor_slice = f.get_slice(tensor_name) - shape = tensor_slice.get_shape() - print(f"Tensor: {tensor_name}") - print(f"File: {tensor_file}") - print(f"Shape: {shape}") - if num_values is not None: - tensor = f.get_tensor(tensor_name) - print(f"Dtype: {tensor.dtype}") - flat = tensor.flatten() - n = min(num_values, flat.numel()) - print(f"Values: {flat[:n].tolist()}") - else: - print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}") - sys.exit(1) - - except FileNotFoundError: - print(f"Error: The file '{file_path}' was not found.") - sys.exit(1) - except Exception as e: - print(f"An error occurred: {e}") - sys.exit(1) - - -def main(): - parser = argparse.ArgumentParser( - description="Print tensor information from a safetensors model" - ) - parser.add_argument( - "tensor_name", - nargs="?", # optional (if --list is used for example) - help="Name of the tensor to inspect" - ) - parser.add_argument( - "-m", "--model-path", - type=Path, - help="Path to the model directory (default: MODEL_PATH environment variable)" - ) - parser.add_argument( - "-l", "--list", - action="store_true", - help="List unique tensor patterns in the model (layer numbers replaced with #)" - ) - parser.add_argument( - "-n", "--num-values", - nargs="?", - const=10, - default=None, 
- type=int, - metavar="N", - help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)" - ) - - args = parser.parse_args() - - model_path = args.model_path - if model_path is None: - model_path_str = os.environ.get("MODEL_PATH") - if model_path_str is None: - print("Error: --model-path not provided and MODEL_PATH environment variable not set") - sys.exit(1) - model_path = Path(model_path_str) - - if not model_path.exists(): - print(f"Error: Model path does not exist: {model_path}") - sys.exit(1) - - if not model_path.is_dir(): - print(f"Error: Model path is not a directory: {model_path}") - sys.exit(1) - - if args.list: - list_all_tensors(model_path, unique=True) - else: - if args.tensor_name is None: - print("Error: tensor_name is required when not using --list") - sys.exit(1) - print_tensor_info(model_path, args.tensor_name, args.num_values) - - -if __name__ == "__main__": - main() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 39d4464663..5e35dcd603 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -5,12 +5,15 @@ #include #include + int main(int argc, char ** argv) { common_params params; params.prompt = "The quick brown fox"; params.sampling.seed = 1234; + const std::string_view state_file = "dump_state.bin"; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -53,35 +56,16 @@ int main(int argc, char ** argv) { // tokenize prompt auto tokens = common_tokenize(ctx, params.prompt, true); - // prepare the batch - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {0}, false); + const bool save_state = true; + if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { + return 1; } - batch.logits[batch.n_tokens - 1] = true; // 
generate next token - - // evaluate prompt - llama_decode(ctx, batch); - n_past += batch.n_tokens; - - // save state (rng, logits, embedding and kv_cache) to file - { - std::vector state_mem(llama_state_get_size(ctx)); - const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size()); - - FILE *fp_write = fopen("dump_state.bin", "wb"); - fwrite(state_mem.data(), 1, written, fp_write); - fclose(fp_write); - - fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size()); - } - - // save state (last tokens) - const auto n_past_saved = n_past; // first run printf("\nfirst run: %s", params.prompt.c_str()); + llama_batch batch = llama_batch_init(1, 0, 1); + for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token_str = common_token_to_piece(ctx, next_token); @@ -111,27 +95,23 @@ int main(int argc, char ** argv) { printf("\nsecond run: %s", params.prompt.c_str()); - // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; + // load state from file + std::vector unused_sts(tokens.size()); // unused session tokens. 
+ size_t n_token_count_out = 0; - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); + if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + fprintf(stderr, "\n%s : failed to load state\n", __func__); + return 1; } + fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + // restore state (last tokens) - n_past = n_past_saved; + n_past = n_token_count_out; + if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { + return 1; + } + ++n_past; // second run for (auto i = 0; i < params.n_predict; i++) { @@ -160,7 +140,9 @@ int main(int argc, char ** argv) { } // make new context - llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + auto params_ctx3 = common_context_params_to_llama(params); + params_ctx3.n_seq_max = 2; + llama_context * ctx3 = llama_init_from_model(model, params_ctx3); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); @@ -169,26 +151,21 @@ int main(int argc, char ** argv) { printf("\nsingle seq run: %s", params.prompt.c_str()); // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; + n_token_count_out = 0; - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx3, state_mem.data(), 
state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); + if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + fprintf(stderr, "\n%s : failed to load state\n", __func__); + return 1; } + fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + // restore state (last tokens) - n_past = n_past_saved; + n_past = n_token_count_out; + if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { + return 1; + } + ++n_past; // save seq 0 and load into seq 1 { diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 55526e6fb3..4dfe28e1d6 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -42,6 +42,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -55,9 +56,10 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K -#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K +#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define 
ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0 @@ -77,6 +79,7 @@ #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -86,6 +89,7 @@ #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K @@ -110,6 +114,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -123,6 +128,7 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K @@ 
-148,6 +154,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -161,6 +168,7 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K @@ -187,6 +195,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -199,6 +208,7 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K @@ -230,6 +240,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic 
ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -243,6 +254,7 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K @@ -276,6 +288,7 @@ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K +#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K #define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K @@ -289,6 +302,7 @@ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K +#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K #define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 3a3b32efb2..c2e4623f37 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -785,6 +785,165 @@ void 
ggml_gemv_q4_K_8x8_q8_K(int n, ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc); } +void ggml_gemv_q5_K_8x4_q8_K(int n, + float * GGML_RESTRICT s, + size_t bs, + const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, + int nr, + int nc) { + constexpr int qk = QK_K; + const int nb = n / qk; + + constexpr int ncols_interleaved = 8; + constexpr int blocklen = 4; + + assert(n % qk == 0); + assert(nc % ncols_interleaved == 0); + + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567 + const uint8x16_t m4b = vdupq_n_u8(0x0f); + const uint8x16_t mone = vdupq_n_u8(1); + const uint8x16_t mtwo = vdupq_n_u8(2); + + // 1x8 tile = 2 x 4 + float32x4_t acc_f32[col_groups]; + + const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy; + + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb); + + for (int i = 0; i < col_groups; i++) { + acc_f32[i] = vdupq_n_f32(0); + } + + for (int b = 0; b < nb; b++) { + float32x4_t q5_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d)); // d0 d1 d2 d3 + float32x4_t q5_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4)); // d4 d5 d6 d7 + float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d); + float32x4_t sb_scale_0123 = vmulq_f32(q5_d_0, q8_d); + float32x4_t sb_scale_4567 = vmulq_f32(q5_d_1, q8_d); + float32x4_t q5_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin)); // dmin 0..3 + float32x4_t q5_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4)); // dmin 4..7 + float32x4_t sb_min_0123 = vmulq_f32(q5_dmin_0, q8_d); + float32x4_t sb_min_4567 = vmulq_f32(q5_dmin_1, q8_d); + + // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567 + int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int32x4_t acc_lo[col_groups]; + int32x4_t 
acc_hi[col_groups]; + + // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block + const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8)); + int16_t bsums_arr[8]; + vst1q_s16(bsums_arr, bsums); + + uint8x16_t qh[col_groups][8]; + for (int c = 0; c < col_groups; c++) { + for (int i = 0; i < 8; i++) { + qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c); + } + } + + for (int sb = 0; sb < QK_K / 64; sb++) { + for (int i = 0; i < col_groups; i++) { + acc_lo[i] = vdupq_n_s32(0); + acc_hi[i] = vdupq_n_s32(0); + } + // Need scales for the low and high nibbles + // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total + int16x8_t q5sb_mins[2]; + int16x8_t q5sb_scales[2]; + for (int i = 0; i < 2; i++) { + int8_t aux_q5sb[8]; + const int offset = sb * 24 + i * 12; + decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb); + q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb)); + } + + int8x16_t q8_qs[4]; + for (int i = 0; i < 4; i++) { + q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16); + } + + for (int c = 0; c < col_groups; c++) { + uint8x16_t q5_cols[8]; + uint8x16_t hbit_lo[8]; + uint8x16_t hbit_hi[8]; + int8x16_t q5_lo[8]; + int8x16_t q5_hi[8]; + + for (int i = 0; i < 8; i++) { + q5_cols[i] = vld1q_u8(q5_ptr[b].qs + sb * QK_K + i * 32 + 16 * c); + hbit_lo[i] = vandq_u8(qh[c][i], mone); + hbit_hi[i] = vshlq_n_u8(vandq_u8(qh[c][i], mtwo), 3); + qh[c][i] = vshrq_n_u8(qh[c][i], 2); + q5_lo[i] = vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_cols[i], m4b), hbit_lo[i], 4)); + q5_hi[i] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_cols[i], 4), hbit_hi[i])); + } + + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[0], q8_qs[0], 0); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[1], q8_qs[0], 1); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[2], q8_qs[0], 2); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[3], q8_qs[0], 3); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[4], q8_qs[1], 0); 
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[5], q8_qs[1], 1); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[6], q8_qs[1], 2); + acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[7], q8_qs[1], 3); + + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[0], q8_qs[2], 0); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[1], q8_qs[2], 1); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[2], q8_qs[2], 2); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[3], q8_qs[2], 3); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[4], q8_qs[3], 0); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[5], q8_qs[3], 1); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[6], q8_qs[3], 2); + acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[7], q8_qs[3], 3); + } + + // Scales + // row c0123 blk0 and blk1 + const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]); + const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]); + const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]), + vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0]))); + acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123); + // row c4567 blk0 and blk1 + const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]); + const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]); + const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]), + vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1]))); + acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567); + + // Bias Correction + const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]); + const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]); + + bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q5sb_mins[0])); + bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q5sb_mins[1])); + bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q5sb_mins[0])); + bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q5sb_mins[1])); + } // for sb + + acc_f32[0] = 
vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123); + acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567); + } // for b + + int base = x * ncols_interleaved; + vst1q_f32(s + base, acc_f32[0]); + vst1q_f32(s + base + 4, acc_f32[1]); + } // for x + return; +#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + ggml_gemv_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc); +} + void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, @@ -3205,6 +3364,235 @@ void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc); } +void ggml_gemm_q5_K_8x4_q8_K(int n, + float * GGML_RESTRICT s, + size_t bs, + const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, + int nr, + int nc) { + constexpr int qk = QK_K; + const int nb = n / qk; + + constexpr int ncols_interleaved = 8; + constexpr int blocklen = 4; + + assert(n % qk == 0); + assert(nr % 4 == 0); + assert(nc % ncols_interleaved == 0); + + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + constexpr int q8_k_blocklen = 4; + constexpr int acc_size = 2 * 4; // 2 row pairs, 4 col pairs + constexpr int col_groups = ncols_interleaved / 4; + const uint8x16_t m4b = vdupq_n_u8(0x0f); + const uint8x16_t mone = vdupq_n_u8(1); + const uint8x16_t mtwo = vdupq_n_u8(2); + + // 8 accumulators: 2 row pairs, 4 col pairs + float32x4_t acc_f32[acc_size]; + + for (int y = 0; y < nr / q8_k_blocklen; y++) { + const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb); + + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb); + + for (int i = 0; i < acc_size; i++) { + acc_f32[i] = vdupq_n_f32(0); + } + + for (int b = 0; b < nb; b++) { + // d5 0 1 2 3, 4 5 6 7 + float32x4_t q5_d_0123 = 
vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d)); + float32x4_t q5_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4)); + // d8 0 1 2 3 + float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d); + // mins + float32x4_t q5_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin)); + float32x4_t q5_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4)); + + // Precomputation of scales and mins + float32x4_t sbd_scale_0123[q8_k_blocklen]; + float32x4_t sbd_scale_4567[q8_k_blocklen]; + float32x4_t sbd_min_0123[q8_k_blocklen]; + float32x4_t sbd_min_4567[q8_k_blocklen]; + + sbd_scale_0123[0] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 0); + sbd_scale_4567[0] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 0); + sbd_min_0123[0] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 0); + sbd_min_4567[0] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 0); + + sbd_scale_0123[1] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 1); + sbd_scale_4567[1] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 1); + sbd_min_0123[1] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 1); + sbd_min_4567[1] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 1); + + sbd_scale_0123[2] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 2); + sbd_scale_4567[2] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 2); + sbd_min_0123[2] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 2); + sbd_min_4567[2] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 2); + + sbd_scale_0123[3] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 3); + sbd_scale_4567[3] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 3); + sbd_min_0123[3] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 3); + sbd_min_4567[3] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 3); + + // Precomputation of bsums, each vpaddq calcs all the bsums for each row + const int16x8_t bsums[q8_k_blocklen] = { + vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)), + vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)), + vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 
2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)), + vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)), + }; + int16_t bsums_arr[QK_K / 64][8]; + for (int q8_row = 0; q8_row < 4; q8_row++) { + vst1q_s16(bsums_arr[q8_row], bsums[q8_row]); + } + + // interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 .. + int32x4_t bias_acc[acc_size]; + for (int i = 0; i < acc_size; i++) { + bias_acc[i] = vdupq_n_s32(0); + } + + uint8x16_t qh[col_groups][8]; + for (int c = 0; c < col_groups; c++) { + for (int i = 0; i < 8; i++) { + qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c); + } + } + + for (int sb = 0; sb < QK_K / 64; sb++) { + // Int accumulators for qs vecdot (4 row * 2 col quartets) + int32x4_t acc_lo[acc_size]; + int32x4_t acc_hi[acc_size]; + for (int i = 0; i < acc_size; i++) { + acc_lo[i] = vdupq_n_s32(0); + acc_hi[i] = vdupq_n_s32(0); + } + // Need scales for the low and high nibbles + // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total + int16x8_t q5sb_scales[2]; + int16x8_t q5sb_mins[2]; + for (int i = 0; i < 2; i++) { + int8_t aux_q5sb[8]; + const int offset = sb * 24 + i * 12; + decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb); + q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb)); + } + + constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows + for (int k = 0; k < reads_per_sb; k++) { + const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k); + const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128); + + // 0..3 & 32..35 + const uint8x16_t q5_0123 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k); + const uint8x16_t q5_4567 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k + 16); + + // NOTE: This is the only difference with q4_K + const uint8x16_t hbit_lo_0123 = vandq_u8(qh[0][k], mone); + const uint8x16_t hbit_hi_0123 = vshlq_n_u8(vandq_u8(qh[0][k], mtwo), 3); + qh[0][k] = vshrq_n_u8(qh[0][k], 2); + const uint8x16_t 
hbit_lo_4567 = vandq_u8(qh[1][k], mone); + const uint8x16_t hbit_hi_4567 = vshlq_n_u8(vandq_u8(qh[1][k], mtwo), 3); + qh[1][k] = vshrq_n_u8(qh[1][k], 2); + // From here, same as q4_K + + const int8x16_t q5_0123_lo = + vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_0123, m4b), hbit_lo_0123, 4)); + const int8x16_t q5_0123_hi = + vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_0123, 4), hbit_hi_0123)); + + acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q5_0123_lo, q8_blk0, 0); // 0..3 r0 c0123 + acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q5_0123_lo, q8_blk0, 1); // 0..3 r1 c0123 + acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q5_0123_lo, q8_blk0, 2); // 0..3 r2 c0123 + acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q5_0123_lo, q8_blk0, 3); // 0..3 r3 c0123 + + acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q5_0123_hi, q8_blk1, 0); // 32..35 r0 c0123 + acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q5_0123_hi, q8_blk1, 1); // 32..35 r1 c0123 + acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q5_0123_hi, q8_blk1, 2); // 32..35 r2 c0123 + acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q5_0123_hi, q8_blk1, 3); // 32..35 r3 c0123 + + const int8x16_t q5_4567_lo = + vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_4567, m4b), hbit_lo_4567, 4)); + const int8x16_t q5_4567_hi = + vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_4567, 4), hbit_hi_4567)); + + acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q5_4567_lo, q8_blk0, 0); // 0..3 r0 c4567 + acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q5_4567_lo, q8_blk0, 1); // 0..3 r1 c4567 + acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q5_4567_lo, q8_blk0, 2); // 0..3 r2 c4567 + acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q5_4567_lo, q8_blk0, 3); // 0..3 r3 c4567 + + acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q5_4567_hi, q8_blk1, 0); // 32..35 r0 c4567 + acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q5_4567_hi, q8_blk1, 1); // 32..35 r1 c4567 + acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q5_4567_hi, q8_blk1, 2); // 32..35 r2 c4567 + acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q5_4567_hi, q8_blk1, 3); // 32..35 r3 c4567 + } + + // Scale and bias application + // 
acc is stored interleaved to match output layout + const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]); + const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]); + const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]); + const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]); + for (int row = 0; row < q8_k_blocklen; row++) { + // Bias correction + // row c0123 blk0 and blk1 + const float32x4_t sumf_0123 = + vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]), + vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row]))); + acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123); + + // row c4567 blk0 and blk1 + const float32x4_t sumf_4567 = + vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]), + vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4]))); + acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567); + + // Bias + const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]); + const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]); + + // row c0123 blk0 and blk1 + bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q5sb_mins[0])); + bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q5sb_mins[1])); + + // row c4567 blk0 and blk1 + bias_acc[2 * row + 1] = + vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q5sb_mins[0])); + bias_acc[2 * row + 1] = + vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q5sb_mins[1])); + } + } // for sb + + for (int row = 0; row < q8_k_blocklen; row++) { + acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]); + acc_f32[2 * row + 1] = + vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]); + } + } // for b + + for (int i = 0; i < q8_k_blocklen; i++) { + int row = y * q8_k_blocklen + i; + for (int j = 0; j < 2; j++) { + int col = x * ncols_interleaved + j * 4; + 
int offset = row * bs + col; + vst1q_f32(s + offset, acc_f32[2 * i + j]); + } + } + } // for x + } // for y + return; +#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + ggml_gemm_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc); +} + void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index f94426ddd7..1b3d23cbed 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -450,6 +450,208 @@ static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n, } } +template +static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n, + float * GGML_RESTRICT s, + size_t bs, + const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, + int nr, + int nc) { + constexpr int blocklen = M; + constexpr int ncols_interleaved = N; + const int qk = QK_K; + const int nb = n / qk; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert(n % qk == 0); + assert(nc % ncols_interleaved == 0); + + UNUSED(bs); + UNUSED(nr); + + float sumf[ncols_interleaved]; + float sum_minf[ncols_interleaved]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + const block_q8_K * a_ptr = (const block_q8_K *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) { + sumf[j] = 0.0; + sum_minf[j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; 
+ } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + constexpr int scale_stride = 32; + uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride; + uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16; + + const int qh_shift = (k / (32 / blocklen)) * 2; + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i; + + const int qh_idx = (k * blocklen + i) % 32; + const int qh_chunk = qh_idx / blocklen; + const int qh_pos = qh_idx % blocklen; + const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos; + + const uint8_t qh_val = b_ptr[l].qh[b_qh_offset]; + const uint8_t h0 = (qh_val >> qh_shift) & 1; + const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1; + + const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4)); + const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4)); + + const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i; + + sumi1 = (v0 * a_ptr[l].qs[q8_offset]); + sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; + for (int j = 0; j < ncols_interleaved; j++) { + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * + GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { + s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; + } + } +} + +template +static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n, + float * GGML_RESTRICT s, + size_t bs, + const void * GGML_RESTRICT vx, + const void * GGML_RESTRICT vy, + int nr, + int nc) { + constexpr 
int blocklen = M; + constexpr int ncols_interleaved = N; + const int qk = QK_K; + const int nb = n / qk; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert(n % qk == 0); + assert(nr % 4 == 0); + assert(nc % ncols_interleaved == 0); + + float sumf[4][ncols_interleaved]; + float sum_minf[4][ncols_interleaved]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumf[m][j] = 0.0; + sum_minf[m][j] = 0.0; + } + } + for (int l = 0; l < nb; l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; + } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + constexpr int scale_stride = 32; + uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride; + uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16; + + const int qh_shift = (k / (32 / blocklen)) * 2; + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i; + + const int qh_idx = (k * blocklen + i) % 32; + const int qh_chunk = qh_idx / blocklen; + const int qh_pos = qh_idx % blocklen; + const int b_qh_offset = + qh_chunk * (blocklen 
* ncols_interleaved) + j * blocklen + qh_pos; + + const uint8_t qh_val = b_ptr[l].qh[b_qh_offset]; + const uint8_t h0 = (qh_val >> qh_shift) & 1; + const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1; + + const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4)); + const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4)); + + const int q8_offset = (k / (32 / blocklen)) * 256 + + (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i; + + sumi1 = (v0 * a_ptr[l].qs[q8_offset]); + sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + } + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; + for (int m = 0; m < 4; m++) { + const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); + for (int j = 0; j < ncols_interleaved; j++) { + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * + GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; + } + } + } + } +} + extern "C" { void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -803,98 +1005,12 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } -void ggml_gemv_q5_K_8x8_q8_K_generic(int n, - float * GGML_RESTRICT s, - size_t bs, - const void * GGML_RESTRICT vx, - const void * GGML_RESTRICT vy, - int nr, - int nc) { - const int qk = QK_K; - const int nb = n / qk; - const int ncols_interleaved = 8; - const int blocklen = 8; - static const uint32_t kmask1 = 0x3f3f3f3f; - static const uint32_t kmask2 = 0x0f0f0f0f; - static const uint32_t kmask3 = 0x03030303; +void 
ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc); +} - assert(n % qk == 0); - assert(nc % ncols_interleaved == 0); - - UNUSED(bs); - UNUSED(nr); - - float sumf[8]; - float sum_minf[8]; - uint32_t utmp[32]; - int sumi1; - int sumi2; - int sumi; - - const block_q8_K * a_ptr = (const block_q8_K *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb); - - for (int j = 0; j < ncols_interleaved; j++) { - sumf[j] = 0.0; - sum_minf[j] = 0.0; - } - for (int l = 0; l < nb; l++) { - for (int sb = 0; sb < 8; sb++) { - memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); - utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); - const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; - utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); - utmp[sb * 4 + 2] = uaux_0; - utmp[sb * 4 + 0] &= kmask1; - } - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32; - uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16; - - const int qh_shift = (k / 4) * 2; - for (int j = 0; j < ncols_interleaved; j++) { - sumi1 = 0; - sumi2 = 0; - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i; - - const int qh_idx = (k * 8 + i) % 32; - const int qh_chunk = qh_idx / 8; - const int qh_pos = qh_idx % 8; - const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos; - - const uint8_t qh_val = b_ptr[l].qh[b_qh_offset]; - const uint8_t h0 = (qh_val >> qh_shift) & 1; - const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1; - - const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4)); - const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 
<< 4)); - - const int q8_offset = (k >> 2) * 64 + (k % 4) * blocklen + i; - - sumi1 = (v0 * a_ptr[l].qs[q8_offset]); - sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]); - sumi1 = sumi1 * scales_0[j]; - sumi2 = sumi2 * scales_1[j]; - sumi += sumi1 + sumi2; - } - sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; - } - } - for (int sb = 0; sb < 8; sb++) { - uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; - for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * - GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; - } - } - } - for (int j = 0; j < ncols_interleaved; j++) { - s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; - } - } +void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc); } @@ -1494,107 +1610,12 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } -void ggml_gemm_q5_K_8x8_q8_K_generic(int n, - float * GGML_RESTRICT s, - size_t bs, - const void * GGML_RESTRICT vx, - const void * GGML_RESTRICT vy, - int nr, - int nc) { - const int qk = QK_K; - const int nb = n / qk; - const int ncols_interleaved = 8; - const int blocklen = 8; +void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc); +} - constexpr uint32_t kmask1 = 0x3f3f3f3f; - constexpr uint32_t kmask2 = 0x0f0f0f0f; - constexpr uint32_t kmask3 = 0x03030303; - - assert(n % qk == 0); - assert(nr % 4 == 0); - assert(nc % ncols_interleaved == 0); - - float sumf[4][8]; - float sum_minf[4][8]; - uint32_t utmp[32]; - int sumi1; - int sumi2; - int sumi; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 
*) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb); - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumf[m][j] = 0.0; - sum_minf[m][j] = 0.0; - } - } - for (int l = 0; l < nb; l++) { - for (int sb = 0; sb < 8; sb++) { - memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); - utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); - const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; - utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); - utmp[sb * 4 + 2] = uaux_0; - utmp[sb * 4 + 0] &= kmask1; - } - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32; - uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16; - - const int qh_shift = (k / 4) * 2; - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi1 = 0; - sumi2 = 0; - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i; - - const int qh_idx = (k * 8 + i) % 32; - const int qh_chunk = qh_idx / 8; - const int qh_pos = qh_idx % 8; - const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos; - - const uint8_t qh_val = b_ptr[l].qh[b_qh_offset]; - const uint8_t h0 = (qh_val >> qh_shift) & 1; - const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1; - - const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4)); - const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4)); - - const int q8_offset = (k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i; - - sumi1 = (v0 * a_ptr[l].qs[q8_offset]); - sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]); - sumi1 = sumi1 * scales_0[j]; - sumi2 = sumi2 * scales_1[j]; - sumi += sumi1 + sumi2; - } - sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; - } - } - } - for (int sb = 0; sb < 
8; sb++) { - uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16; - for (int m = 0; m < 4; m++) { - const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); - for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * - GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; - } - } - } - } - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; - } - } - } - } +void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -2029,18 +2050,16 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in const int end = QK_K * 4 / blck_size_interleave; - // Interleave Q5_K quants by taking 8 bytes at a time + // Interleave Q5_K quants by taking blck_size_interleave bytes at a time for (int i = 0; i < end; ++i) { int src_id = i % 8; int src_offset = (i / 8) * blck_size_interleave; int dst_offset = i * blck_size_interleave; - uint64_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave); } - // Repeat for low bits 8 bytes at a time as well, since + // Repeat for high bits with the same chunk size, since // the high bits are interleaved in Q5_K and the index is // qh_idx = (qs_idx % 32); // qh_val = qh[qh_idx] >> (qs_idx / 32); @@ -2049,9 +2068,7 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in int src_offset = (i / 8) * blck_size_interleave; int dst_offset = i * 
blck_size_interleave; - uint64_t elems; - memcpy(&elems, &in[src_id].qh[src_offset], sizeof(uint64_t)); - memcpy(&out.qh[dst_offset], &elems, sizeof(uint64_t)); + memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave); } // The below logic is copied over from Q4_K @@ -2249,7 +2266,7 @@ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q5_K); - GGML_ASSERT(interleave_block == 8); + GGML_ASSERT(interleave_block == 4 || interleave_block == 8); constexpr int nrows_interleaved = 8; block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data; @@ -2523,6 +2540,10 @@ template <> int repack(struct ggml_tensor * t, const void * da return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size); } +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size); +} + template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size); } @@ -2591,6 +2612,10 @@ template <> void gemv(int n, float * s, size_t ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); +} + template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } @@ -2654,6 +2679,10 @@ template <> void gemm(int n, float * s, size_t ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); +} + template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } @@ -3068,6 +3097,7 @@ static 
const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons static const ggml::cpu::repack::tensor_traits q4_K_8x8_q8_K; // instance for Q5_K + static const ggml::cpu::repack::tensor_traits q5_K_8x4_q8_K; static const ggml::cpu::repack::tensor_traits q5_K_8x8_q8_K; // instance for Q6_K @@ -3130,6 +3160,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons return &q5_K_8x8_q8_K; } } + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + if (cur->ne[1] % 8 == 0) { + return &q5_K_8x4_q8_K; + } + } } else if (cur->type == GGML_TYPE_Q6_K) { if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { if (cur->ne[1] % 8 == 0) { diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h index 39b6b48238..ddf03d7642 100644 --- a/ggml/src/ggml-cpu/repack.h +++ b/ggml/src/ggml-cpu/repack.h @@ -111,6 +111,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int 
nc); @@ -122,6 +123,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -143,6 +145,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -154,6 +157,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a3256d59dd..36d8a3aaab 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1149,8 +1149,7 @@ struct ggml_cuda_graph { size_t num_nodes = 0; std::vector nodes; bool disable_due_to_gpu_arch = false; - bool disable_due_to_too_many_updates = false; - int number_consecutive_updates = 0; + bool 
warmup_complete = false; std::vector props; // these are extra tensors (inputs) that participate in the ggml graph but are not nodes @@ -1159,21 +1158,9 @@ struct ggml_cuda_graph { // ref: https://github.com/ggml-org/llama.cpp/pull/19165 std::vector extra; - void record_update(bool use_graph, bool update_required) { - if (use_graph && update_required) { - number_consecutive_updates++; - } else { - number_consecutive_updates = 0; - } - if (number_consecutive_updates >= 4) { - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); - disable_due_to_too_many_updates = true; - } - } - bool is_enabled() const { static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates); + return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env); } #endif }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ffa35eeb65..7e6d330354 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2979,10 +2979,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); - if (graph->instance == nullptr) { - res = true; - } - // Check if the graph size has changed if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; @@ -3931,14 +3927,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, #ifdef USE_CUDA_GRAPH graph_key = ggml_cuda_graph_get_key(cgraph); - use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key); + ggml_cuda_graph_set_enabled(cuda_ctx, graph_key); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->is_enabled()) { - cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = 
ggml_cuda_graph_check_compability(cgraph); + const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph); + if (graph_compatible) { + const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph); - graph->record_update(use_cuda_graph, cuda_graph_update_required); + if (!graph->warmup_complete) { + // Warmup: need at least 2 calls with no property change on the 2nd call + if (!properties_changed) { + graph->warmup_complete = true; + GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__); + use_cuda_graph = true; + cuda_graph_update_required = true; + } + // else: properties changed or first call - execute directly (use_cuda_graph stays false) + } else { + // Post-warmup: normal CUDA graph operation + if (properties_changed) { + // Properties changed - reset warmup, execute directly until stable again + graph->warmup_complete = false; + GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__); + } else { + use_cuda_graph = true; + cuda_graph_update_required = graph->instance == nullptr; + } + } + } } #endif // USE_CUDA_GRAPH diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 54f9986498..7a44443a8a 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1749,23 +1749,6 @@ static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backe return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer; } -static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) { - if (x->ne[0] != y->ne[0]) { - return false; - } - if (x->ne[1] != y->ne[1]) { - return false; - } - if (x->ne[2] != y->ne[2]) { - return false; - } - if (x->ne[3] != y->ne[3]) { - return false; - } - - return true; -} - static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct 
ggml_tensor * src1 = op->src[1]; @@ -1797,43 +1780,6 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess return opt_experimental; } -static bool hex_supported_src0_type(ggml_type t) { - return t == GGML_TYPE_F32; -} - -static bool hex_supported_src1_type(ggml_type t) { - return t == GGML_TYPE_F32; -} - -static bool hex_supported_src2_type(ggml_type t) { - return t == GGML_TYPE_F32; -} - -static bool hex_supported_src1_type2(ggml_type t) { - return t == GGML_TYPE_F16; -} - -static bool hex_supported_src1_type3(ggml_type t) { - return t == GGML_TYPE_I32; -} - -static bool hex_supported_dst_type(ggml_type t) { - return t == GGML_TYPE_F32; -} - -static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) { - // TODO: support broadcast for ne[2 and 3] - if (x->ne[0] != y->ne[0]) { - return false; - } - if (x->ne[2] != y->ne[2]) { - return false; - } - if (x->ne[3] != y->ne[3]) { - return false; - } - return true; -} static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; @@ -1919,19 +1865,19 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_src1_type(src1->type)) { + if (src1->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dims2(src0, dst)) { + if (!ggml_are_same_shape(src0, dst)) { return false; } - if (!ggml_can_repeat(src1, src0)) { + if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) { return false; } @@ -1943,16 +1889,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se const struct ggml_tensor * src1 = 
op->src[1]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_src1_type(src1->type)) { + if (src1->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dims2(src0, dst)) { + if (!ggml_are_same_shape(src0, dst)) { return false; } @@ -1968,13 +1914,13 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dims2(src0, dst)) { + if (!ggml_are_same_shape(src0, dst)) { return false; } @@ -1990,10 +1936,10 @@ static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } @@ -2011,10 +1957,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } @@ -2023,10 +1969,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } if (src1) { - if (!hex_supported_src1_type(src1->type)) { + if (src1->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dims2(src0, src1)) { + if (!ggml_are_same_shape(src0, src1)) { return 
false; } if (!ggml_is_contiguous(src1)) { @@ -2047,15 +1993,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s return false; // FIXME: add support for sinks } - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } if (src1) { - if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) { + if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) { return false; } if (src0->ne[0] != src1->ne[0]) { @@ -2162,17 +2108,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess const struct ggml_tensor * src2 = op->src[2]; const struct ggml_tensor * dst = op; - if (!hex_supported_src0_type(src0->type)) { + if (src0->type != GGML_TYPE_F32) { return false; // FIXME: add support for GGML_TYPE_F16 for src0 } - if (!hex_supported_dst_type(dst->type)) { + if (dst->type != GGML_TYPE_F32) { return false; } - if (!hex_supported_src1_type3(src1->type)) { + if (src1->type != GGML_TYPE_I32) { return false; } if (src2) { - if (!hex_supported_src2_type(src2->type)) { + if (src2->type != GGML_TYPE_F32) { return false; } int n_dims = op_params[1]; diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 950d836ad3..21bd4050a1 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -69,27 +69,45 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +struct htp_act_context { + struct htp_ops_context * octx; + + // Precomputed 
values + const uint8_t * data_src0; + const uint8_t * data_src1; + uint8_t * data_dst; + + size_t src0_row_size; + size_t src1_row_size; + size_t dst_row_size; + + size_t src0_row_size_aligned; + size_t src1_row_size_aligned; + size_t dst_row_size_aligned; + + size_t src0_spad_half_size; + size_t src1_spad_half_size; + size_t dst_spad_half_size; + + uint32_t block; + uint32_t src0_nrows; + uint32_t src0_nrows_per_thread; + int nc; +}; + +static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; - - - - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -101,43 +119,34 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? 
ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int nc = actx->nc; - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; - - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "swiglu-f32 : current VTCM reservation %zu is 
too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -196,27 +205,22 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -226,45 +230,36 @@ static void 
glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int nc = actx->nc; - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; 
- // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; - - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least " "%zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } - const float alpha = ((const float *) (op_params))[2]; - const float limit = ((const float *) (op_params))[3]; + const float alpha = ((const float *) (actx->octx->op_params))[2]; + const float limit = ((const float *) (actx->octx->op_params))[3]; + + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { @@ -335,26 +330,22 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, } -static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble2; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; - const size_t 
src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size = actx->src0_row_size; + const size_t dst_row_size = actx->dst_row_size; + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -364,25 +355,29 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * data_src0 = (const uint8_t *) src0->data; - uint8_t * data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = actx->data_src0; + uint8_t * data_dst = actx->data_dst; - uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + // nc/ne0 matches. 
+ const int ne0_val = actx->nc; // == dst->ne[0] - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); + + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; // In gelu = x*sigmoid(x*1.702) - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -408,9 +403,9 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // gelu = x * sigmoid(1.702 * x) // current implementation - hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0); - hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0_val); + 
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); + hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -435,34 +430,23 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - - -static void unary_silu_f32_per_thread(const struct htp_tensor * src0, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void unary_silu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble2; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size = actx->src0_row_size; + const size_t dst_row_size = actx->dst_row_size; + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const 
uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -472,24 +456,27 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * data_src0 = (const uint8_t *) src0->data; - uint8_t * data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = actx->data_src0; + uint8_t * data_dst = actx->data_dst; - uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + const int ne0_val = actx->nc; // == dst->ne[0] - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; + + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -515,8 +502,8 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, 
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // silu = x * sigmoid(x) - hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0); - hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0_val); + hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -544,27 +531,22 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, static const float GELU_COEF_A = 0.044715f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; -static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = 
MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -574,43 +556,34 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int nc = actx->nc; - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t 
src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; - - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "geglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -678,33 +651,7 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_silu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) { - struct 
htp_ops_context * octx = (struct htp_ops_context *) data; - glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_geglu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_geglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - static int execute_op_activations_f32(struct htp_ops_context * octx) { - int err = HTP_STATUS_OK; - const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; struct htp_tensor * dst = &octx->dst; @@ -719,26 +666,26 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { switch (octx->op) { case HTP_OP_UNARY_SILU: - act_op_func = unary_silu_f32; + act_op_func = (worker_callback_t)unary_silu_f32_per_thread; op_type = "silu-f32"; break; case HTP_OP_GLU_SWIGLU: - act_op_func = glu_swiglu_f32; + act_op_func = (worker_callback_t)glu_swiglu_f32_per_thread; op_type = "swiglu-f32"; break; case HTP_OP_GLU_SWIGLU_OAI: - act_op_func = glu_swiglu_oai_f32; + act_op_func = (worker_callback_t)glu_swiglu_oai_f32_per_thread; op_type = "swiglu-oai-f32"; break; case HTP_OP_UNARY_GELU: - act_op_func = unary_gelu_f32; + act_op_func = (worker_callback_t)unary_gelu_f32_per_thread; op_type = "gelu-f32"; break; case HTP_OP_GLU_GEGLU: - act_op_func = glu_geglu_f32; + act_op_func = (worker_callback_t)glu_geglu_f32_per_thread; op_type = "geglu-f32"; break; default: @@ -797,13 +744,58 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); } - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = 
(src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs); + if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + return HTP_STATUS_OK; } - return err; + uint32_t n_jobs = MIN(n_threads, src0_nrows); + + // Prepare context + struct htp_act_context actx; + actx.octx = octx; + + actx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + + actx.src0_row_size = src0_row_size; + actx.src1_row_size = src1_row_size; + actx.dst_row_size = dst_row_size; + + actx.src0_row_size_aligned = src0_row_size_aligned; + actx.src1_row_size_aligned = src1_row_size_aligned; + actx.dst_row_size_aligned = dst_row_size_aligned; + + actx.src0_spad_half_size = octx->src0_spad.size_per_thread / 2; + actx.src1_spad_half_size = octx->src1_spad.size_per_thread / 2; + actx.dst_spad_half_size = octx->dst_spad.size_per_thread / 2; + + actx.block = actx.src0_spad_half_size / actx.src0_row_size_aligned; + actx.src0_nrows = src0_nrows; + + actx.nc = dst->ne[0]; + + // Pointers and GLU logic + const uint8_t * data_src0 = (const uint8_t *) src0->data; + const uint8_t * data_src1 = (const uint8_t *) src1->data; + + if (!src1_valid && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) { + const int32_t swapped = octx->op_params[1]; + data_src1 = data_src0; + actx.src1_row_size = actx.src0_row_size; + + size_t nc_in_bytes = actx.nc * SIZEOF_FP32; + if (swapped) { + data_src0 += nc_in_bytes; + } else { + data_src1 += nc_in_bytes; + } + } + + actx.data_src0 = data_src0; + actx.data_src1 = data_src1; + actx.data_dst = (uint8_t *) dst->data; + + worker_pool_run_func(octx->ctx->worker_pool, act_op_func, &actx, n_jobs); + return HTP_STATUS_OK; } int op_activations(struct htp_ops_context * octx) { diff --git a/ggml/src/ggml-hexagon/htp/get-rows-ops.c b/ggml/src/ggml-hexagon/htp/get-rows-ops.c index a657cd2dcf..bf24bbda70 100644 --- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c +++ 
b/ggml/src/ggml-hexagon/htp/get-rows-ops.c @@ -15,6 +15,13 @@ #include "htp-ops.h" #include "hvx-utils.h" +struct get_rows_context { + struct htp_ops_context * octx; + uint32_t src1_nrows_per_thread; + struct fastdiv_values get_rows_div_ne10; + struct fastdiv_values get_rows_div_ne10_ne11; +}; + #define get_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ const uint32_t ne01 = octx->src0.ne[1]; \ @@ -39,20 +46,22 @@ \ const uint32_t nr = ne10 * ne11 * ne12; -static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) { +static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { + struct get_rows_context * grctx = (struct get_rows_context *)data; + struct htp_ops_context * octx = grctx->octx; get_rows_preamble; // parallelize by src1 elements (which correspond to dst rows) - const uint32_t dr = octx->src1_nrows_per_thread; + const uint32_t dr = grctx->src1_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; const bool is_i32 = (octx->src1.type == HTP_TYPE_I32); for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastdiv(i, &octx->get_rows_div_ne10_ne11); + const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11); const uint32_t rem = i - i12 * ne11 * ne10; - const uint32_t i11 = fastdiv(rem, &octx->get_rows_div_ne10); + const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10); const uint32_t i10 = rem - i11 * ne10; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -68,12 +77,6 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3; hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } - - return HTP_STATUS_OK; -} - -static void get_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) { - get_rows_thread_f32_f32((struct htp_ops_context *) data, n, i); } int op_get_rows(struct htp_ops_context * octx) { @@ -95,12 +98,14 @@ int op_get_rows(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - octx->get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]); - octx->get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]); + struct get_rows_context grctx; + grctx.octx = octx; + grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]); + grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]); const uint32_t n_jobs = MIN(nr, octx->n_threads); - octx->src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + grctx.src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, get_rows_work_f32_f32, octx, n_jobs); + worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_jobs); return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index d1ddb0ecbf..350ab9d966 100644 --- 
a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -102,7 +102,7 @@ static inline bool dma_queue_push(dma_queue * q, dmlink(q->tail, desc); q->tail = desc; - // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); + // FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src); q->push_idx = (q->push_idx + 1) & q->idx_mask; return true; } @@ -144,11 +144,37 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) { dptr = q->dptr[q->pop_idx]; - // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); + // FARF(ERROR, "dma-pop: i %u dst %p src %p\n", q->pop_idx, dptr.dst, dptr.src); q->pop_idx = (q->pop_idx + 1) & q->idx_mask; return dptr; } +static inline dma_ptr dma_queue_pop_nowait(dma_queue * q) { + dma_ptr dptr = { NULL }; + + if (q->push_idx == q->pop_idx) { + return dptr; + } + + dptr = q->dptr[q->pop_idx]; + + // FARF(ERROR, "dma-pop-nowait: i %u dst %p src %p\n", q->pop_idx, dptr.dst, dptr.src); + q->pop_idx = (q->pop_idx + 1) & q->idx_mask; + return dptr; +} + +static inline bool dma_queue_empty(dma_queue * q) { + return q->push_idx == q->pop_idx; +} + +static inline uint32_t dma_queue_depth(dma_queue * q) { + return (q->push_idx - q->pop_idx) & q->idx_mask; +} + +static inline uint32_t dma_queue_capacity(dma_queue * q) { + return q->capacity; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index f1ad24dbfa..127ab1d665 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -44,32 +44,6 @@ struct htp_ops_context { uint32_t src0_nrows_per_thread; uint32_t src1_nrows_per_thread; - struct fastdiv_values src0_div1; // fastdiv values for ne1 - struct fastdiv_values src0_div2; // fastdiv values for ne2 - struct fastdiv_values src0_div3; // fastdiv values for ne3 - struct fastdiv_values src0_div21; // fastdiv values for ne2 * ne1 - 
- struct fastdiv_values src1_div1; // fastdiv values for ne1 - struct fastdiv_values src1_div2; // fastdiv values for ne2 - struct fastdiv_values src1_div3; // fastdiv values for ne3 - struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1 - - struct fastdiv_values src3_div1; // fastdiv values for ne1 - struct fastdiv_values src3_div2; // fastdiv values for ne2 - struct fastdiv_values src3_div3; // fastdiv values for ne3 - struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1 - - struct fastdiv_values broadcast_rk2; - struct fastdiv_values broadcast_rk3; - struct fastdiv_values broadcast_rv2; - struct fastdiv_values broadcast_rv3; - - struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12 - struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11 - - struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10 - struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11 - uint32_t flags; }; diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index c360abe8da..6f6f51f01f 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -49,62 +49,6 @@ struct htp_matmul_context { struct fastdiv_values mm_div_r3; }; -// vdelta control to replicate first 4x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_4x_f32[128] = { - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, - 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, - 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, - 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, - 0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, - 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, -}; - -// vdelta control to replicate and interleave first 8x fp32 values across lanes -static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_f32[128] = { - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, - 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, - 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, - 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, - 0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x44, 0x44, 0x44, - 0x44, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, - 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, -}; - -// vdelta control to replicate first fp32 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_f32[128] = { - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, - 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, - 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, - 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, - 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, - 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 
- 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, -}; - -// vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_1x_f16[128] = { - 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, - 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, - 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, - 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, - 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, - 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, - 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, -}; - -// vdelta control to replicate first fp16 value across all elements -static const uint8_t __attribute__((aligned(128))) repl_2x_f16[128] = { - 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, - 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, -}; - // vdelta control to expand first 32 e8m0 
values into 32 uint32 elements static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = { 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00, @@ -2067,10 +2011,10 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements // Convert to QF32 - HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); - HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); - HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); - HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); + HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); // replicated over all lanes + HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); // replicated over all lanes + HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); // replicated over all lanes + HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); // replicated over all lanes // Combine and convert to fp16 HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); @@ -2080,11 +2024,6 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); - // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_f16; - vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); - vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); - HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); @@ -2130,13 +2069,8 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric 
HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); // Compute max and scale - HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); - HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); - - // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; - vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl); - vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl); + HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); // replicated over all lanes + HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // replicated over all lanes HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 @@ -2179,11 +2113,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric // Compute max and scale HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); - vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); - - // Replicate first fp16 scale across all lanes - HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16; - vmax_hf = Q6_V_vdelta_VV(vmax_hf, ctrl); + vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // replicated over all lanes HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16); diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 943ca5c952..aa6a6c9008 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -10,6 +10,7 @@ #include "hex-dma.h" #include "hvx-utils.h" +#include "hex-fastdiv.h" #define GGML_COMMON_DECL_C #include "ggml-common.h" @@ -21,6 +22,9 @@ #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 +#define HTP_ROPE_SPAD_NROWS 16 +#define 
HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2) + #define htp_rope_preamble \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -42,7 +46,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -struct rope_th_ctx { +struct htp_rope_context { int32_t n_dims; int32_t mode; int32_t n_ctx_orig; @@ -57,7 +61,19 @@ struct rope_th_ctx { float theta_scale; float corr_dims[2]; + uint32_t src0_nrows_per_thread; + size_t spad_stride; + struct htp_ops_context * octx; + + size_t src0_row_size; + size_t dst_row_size; + size_t src0_row_size_aligned; + size_t dst_row_size_aligned; + size_t theta_cache_offset; + uint32_t src0_nrows; + + uint64_t t_start; }; static float rope_yarn_ramp(const float low, const float high, const int i0) { @@ -117,64 +133,23 @@ static void rope_corr_dims(int n_dims, dims[1] = MIN(n_dims - 1, end); } -static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) { - memset(rope_ctx, 0, sizeof(struct rope_th_ctx)); +static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { + const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; + const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; + HVX_Vector * restrict vdst = (HVX_Vector *) dst; - const int32_t * op_params = &octx->op_params[0]; + uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2 - rope_ctx->n_dims = ((const int32_t *) op_params)[1]; - rope_ctx->mode = ((const int32_t *) op_params)[2]; - rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4]; + uint32_t he = ne / 2; // half_dims offset in elements + uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors - memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); - memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float)); - memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); - 
memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); - memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); - memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); - memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); + #pragma unroll(2) + for (uint32_t i = 0; i < nvec; i += 2) { + HVX_Vector v0 = vsrc[i/2+0]; + HVX_Vector v1 = vsrc[i/2+hv]; - rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims); - - rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast, - rope_ctx->beta_slow, rope_ctx->corr_dims); - - rope_ctx->octx = octx; - FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims, - rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor); -} - -static void hvx_calc_rope_neox_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { - // for (int i = 0; i < num_elems; i += 2) { - //const float cos_theta = theta_cache[i + 0]; - //const float sin_theta = theta_cache[i + 1]; - - //const float x0 = src[0]; - //const float x1 = src[num_elems/2]; - - //dst[0] = x0*cos_theta - x1*sin_theta; - //dst[num_elems/2] = x0*sin_theta + x1*cos_theta; - - //src += 1; - //dst += 1; - // } - - const uint8_t * restrict src0_curr = (const uint8_t *) src0; - const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; - uint8_t * restrict dst_curr = (uint8_t *) dst; - - int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once - int half_size = (sizeof(float) * (num_elems / 2)); - - for (int i = 0; i < step_of_1; i++) { - HVX_Vector v0 = *(HVX_Vector *) src0_curr; - HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size); - - HVX_Vector v2 = *(HVX_Vector *) theta_curr; - HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN); + HVX_Vector v2 = vtheta[i+0]; + HVX_Vector v3 = vtheta[i+1]; 
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta @@ -186,45 +161,34 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0, HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); - *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); + vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4); + vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5); + } - src0_curr += VLEN; - theta_curr += 2 * VLEN; - dst_curr += VLEN; + for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { + const float cos_theta = theta_cache[i+0]; + const float sin_theta = theta_cache[i+1]; + float x0 = src0[i/2]; + float x1 = src0[i/2 + he]; + dst[i/2] = x0 * cos_theta - x1 * sin_theta; + dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta; } } -static void hvx_calc_rope_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { - // for (int i = 0; i < num_elems; i += 2) { - //const float cos_theta = theta_cache[i + 0]; - //const float sin_theta = theta_cache[i + 1]; +static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { + const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; + const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; + HVX_Vector * restrict vdst = (HVX_Vector *) dst; - //const float x0 = src[0]; - //const float x1 = src[1]; + uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two - //dst[0] = x0*cos_theta - x1*sin_theta; - //dst[1] = x0*sin_theta + x1*cos_theta; + #pragma unroll(2) + for (uint32_t i = 0; i < nvec; i+=2) { + HVX_Vector v0 = vsrc[i+0]; + HVX_Vector v1 = vsrc[i+1]; - //src += 2; - //dst += 2; - // } - - const uint8_t * restrict src0_curr = (const uint8_t *) src0; - const uint8_t * restrict theta_curr = (const uint8_t *) 
theta_cache; - uint8_t * restrict dst_curr = (uint8_t *) dst; - - int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once - - for (int i = 0; i < step_of_1; i++) { - HVX_Vector v0 = *(HVX_Vector *) src0_curr; - HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN); - - HVX_Vector v2 = *(HVX_Vector *) theta_curr; - HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN); + HVX_Vector v2 = vtheta[i+0]; + HVX_Vector v3 = vtheta[i+1]; HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1 HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta @@ -239,116 +203,65 @@ static void hvx_calc_rope_f32(const float * restrict src0, HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); - *(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore); - *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore); + vdst[i+0] = Q6_V_lo_W(vstore); + vdst[i+1] = Q6_V_hi_W(vstore); + } - src0_curr += 2 * VLEN; - theta_curr += 2 * VLEN; - dst_curr += 2 * VLEN; + for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { + const float cos_theta = theta_cache[i+0]; + const float sin_theta = theta_cache[i+1]; + float x0 = src0[i+0]; + float x1 = src0[i+1]; + dst[i+0] = x0 * cos_theta - x1 * sin_theta; + dst[i+1] = x0 * sin_theta + x1 * cos_theta; } } -static void rope_hex_f32(struct rope_th_ctx * rope_ctx, - const uint32_t ir0, - const uint32_t ir1, - int nth, - int ith, - const int opt_path) { - struct htp_ops_context * octx = rope_ctx->octx; +static void inline rope_basic_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src, + uint32_t nr, uint32_t ne0, const float * restrict theta_cache) { + #pragma unroll(4) + for (uint32_t i = 0; i < nr; i++) { + float * d = (float *) (dst + i * rctx->dst_row_size_aligned); + float * s = (float *) (src + i * rctx->src0_row_size_aligned); + + hvx_rope_f32_aa(d, s, rctx->n_dims, theta_cache); + + // fill the remain 
channels with data from src tensor + if (rctx->n_dims < ne0) { + hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims); + } + } +} + +static void inline rope_neox_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src, + uint32_t nr, uint32_t ne0, const float * restrict theta_cache) { + #pragma unroll(4) + for (uint32_t i = 0; i < nr; i++) { + float * d = (float *) (dst + i * rctx->dst_row_size_aligned); + float * s = (float *) (src + i * rctx->src0_row_size_aligned); + + hvx_rope_neox_f32_aa(d, s, rctx->n_dims, theta_cache); + + // fill the remain channels with data from src tensor + if (rctx->n_dims < ne0) { + hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims); + } + } +} + +static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_rope_context * rctx = (struct htp_rope_context *) data; + struct htp_ops_context * octx = rctx->octx; const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - const int32_t mode = rope_ctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; - htp_rope_preamble; - const int32_t * pos = (const int32_t *) src1->data; - - float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01)); - - const float * freq_factors = NULL; - if (src2 != NULL) { - freq_factors = (const float *) src2->data; - } - - const uint32_t i1_end = MIN(ir1, ne1); - const int32_t half_dims = rope_ctx->n_dims / 2; - const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float); - for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch - for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len - const int32_t p = pos[i2]; - - rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, - rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - - for (uint32_t i1 = ir0; 
i1 < i1_end; i1++) { // attn-heads - const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); - float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); - - const float * src_loc = src; - float * dst_data_loc = dst_data; - - if (1 == opt_path) { - if (is_neox) { - hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); - } else { - hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); - } - - src_loc += rope_ctx->n_dims; - dst_data_loc += rope_ctx->n_dims; - } else { - for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { - const float cos_theta = wp0[i0 + 0]; - const float sin_theta = wp0[i0 + 1]; - - if (is_neox) { - const float x0 = src_loc[0]; - const float x1 = src_loc[half_dims]; - - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; - - src_loc += 1; - dst_data_loc += 1; - } else { - const float x0 = src_loc[0]; - const float x1 = src_loc[1]; - - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; - - src_loc += 2; - dst_data_loc += 2; - } - } - - src_loc += (is_neox ? half_dims : 0); - dst_data_loc += (is_neox ? 
half_dims : 0); - } - - // TODO: use simd to speed up the remaining elements copy - memcpy(dst_data_loc, src_loc, remain_bytes); - } - } - } -} - -static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) { - struct htp_ops_context * octx = rope_ctx->octx; - - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; - - htp_rope_preamble; - - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows - const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + const uint32_t src0_nrows = rctx->src0_nrows; + const uint32_t src0_nrows_per_thread = rctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -358,32 +271,114 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + uint64_t tt = HAP_perf_get_qtimer_count(); - int is_aligned = 1; - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || - (0 == hex_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); - is_aligned = 0; - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; + const int32_t mode = rctx->mode; + const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + + // VTCM setup + uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + float * theta_cache = (float *) (src0_spad_base); + src0_spad_base = src0_spad_base + rctx->theta_cache_offset; + uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); + + dma_queue * dma_queue = octx->ctx->dma[ith]; + const int32_t * pos = (const int32_t *) src1->data; + const float * freq_factors = src2->data ? 
(const float *) src2->data : NULL; + + uint32_t ir = 0; + uint32_t prev_i2 = (uint32_t) -1; + + for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch + for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len + for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads + if (ir < src0_start_row) { ir++; i1++; continue; } + if (ir >= src0_end_row) goto done; + + // Rows in this block + const uint32_t nrows = MIN(src0_end_row - ir, ne1 - i1); + + // Depth before prefetch + uint32_t dma_depth = dma_queue_depth(dma_queue); + + // FARF(HIGH, "rope-block %u: ir %u n-rows %u dma-depth %u : usec %u", ith, ir, nrows, dma_depth, + // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); + + // Prefetch loop + for (uint32_t pnr = 0, pr = 0; pr < nrows && pr < HTP_ROPE_SPAD_NROWS; pr += pnr) { + pnr = MIN(nrows - pr, HTP_ROPE_SPAD_BLOCK); + + uint32_t pi1 = i1 + pr; + uint32_t pir = ir + pr; + + // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + pr * rctx->dst_row_size_aligned), 0, 0, 0); + + const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01; + uint8_t * src_spad = src0_spad_base + pr * rctx->src0_row_size_aligned; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr), + rctx->src0_row_size_aligned, rctx->src0_row_size, pnr); + + // FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p pnr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr); + } + + // Update theta cache + if (i2 != prev_i2) { + prev_i2 = i2; + + const int32_t p = pos[i2]; + rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); + + // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache, + // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - 
rctx->t_start)); + } + + // Skip DMA transactions from prev block (if any) + // No need to wait for these since the DMA is setup for in-order processing + for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop_nowait(dma_queue); } + + // Compute loop + for (uint32_t cnr = 0, cr = 0; cr < nrows; cr += cnr, ir += cnr, i1 += cnr) { + // Number of rows to compute + cnr = MIN(nrows - cr, HTP_ROPE_SPAD_BLOCK); + + uint8_t * dst_spad = (uint8_t *) dma_queue_pop(dma_queue).src; + uint8_t * src_spad = (uint8_t *) dma_queue_pop(dma_queue).dst; + + // FARF(HIGH, "rope-compute %u: ir %u i1 %u i2 %u i3 %u src-spad %p cnr %u : usec %u", ith, ir, i1, i2, i3, src_spad, cnr, + // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); + + if (is_neox) { + rope_neox_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache); + } else { + rope_basic_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache); + } + + uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1; + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, cnr); + + // Prefetch more rows (if any) + if ((cr + HTP_ROPE_SPAD_NROWS) < nrows) { + uint32_t pnr = MIN(nrows - (cr + HTP_ROPE_SPAD_NROWS), HTP_ROPE_SPAD_BLOCK); + uint32_t pi1 = i1 + HTP_ROPE_SPAD_NROWS; + uint32_t pir = ir + HTP_ROPE_SPAD_NROWS; + + const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr), + rctx->src0_row_size_aligned, rctx->src0_row_size, pnr); + + // FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p pnr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr); + } + } + } + } } - rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); +done: + dma_queue_flush(dma_queue); + tt = HAP_perf_get_qtimer_count() - tt; - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "rope-f32: %d/%d/%d: 
(%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); -} - -static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { - struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data; - - rope_job_f32_per_thread(rope_ctx, n, i); + FARF(HIGH, "rope-f32: %d/%d: (%u:%u) usec %u\n", ith, nth, src0_start_row, src0_end_row, (unsigned) HAP_perf_qtimer_count_to_us(tt)); } static int execute_op_rope_f32(struct htp_ops_context * octx) { @@ -394,17 +389,10 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - worker_callback_t op_func; - const char * op_type = NULL; - - struct rope_th_ctx rope_ctx; + const char * op_type = "rope-f32"; switch (octx->op) { case HTP_OP_ROPE: - op_func = rope_job_dispatcher_f32; - op_type = "rope-f32"; - - init_rope_ctx(&rope_ctx, octx); break; default: @@ -415,49 +403,79 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { const uint32_t n_threads = octx->n_threads; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src0_row_size; const size_t dst_row_size = dst->nb[1]; - // VTCM scratchpads for all tensors - // N rows per thread, padded to HVX vector size - octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // Aligned row sizes for VTCM + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128); - size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + // Calculate spad sizes per thread + size_t src0_spad_per_thread = theta_cache_size_aligned + 
HTP_ROPE_SPAD_NROWS * src0_row_size_aligned; + size_t dst_spad_per_thread = HTP_ROPE_SPAD_NROWS * dst_row_size_aligned; + size_t spad_per_thread = src0_spad_per_thread + dst_spad_per_thread; - if (src2->ne[0]) { - FARF(HIGH, - "%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u " - "dst-spad-size %u\n", - op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], - src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], - dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); - } else { - FARF(HIGH, - "%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", - op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], - src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, - octx->dst_spad.size); - } - - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); + // Check if we fit in VTCM + size_t total_vtcm_needed = spad_per_thread * n_threads; + if (octx->ctx->vtcm_size < total_vtcm_needed) { + FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, total_vtcm_needed); return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + // Assign sizes + octx->src0_spad.size_per_thread = src0_spad_per_thread; + octx->dst_spad.size_per_thread = dst_spad_per_thread; + octx->src0_spad.size = n_threads * src0_spad_per_thread; + octx->dst_spad.size = n_threads * dst_spad_per_thread; + octx->src1_spad.size = 0; + // 
Assign pointers + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = NULL; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + // Fill context + struct htp_rope_context rctx; + memset(&rctx, 0, sizeof(struct htp_rope_context)); + + rctx.t_start = HAP_perf_get_qtimer_count(); + + rctx.octx = octx; + + const int32_t * op_params = &octx->op_params[0]; + rctx.n_dims = ((const int32_t *) op_params)[1]; + rctx.mode = ((const int32_t *) op_params)[2]; + rctx.n_ctx_orig = ((const int32_t *) op_params)[4]; + + memcpy(&rctx.freq_base, (int32_t *) op_params + 5, sizeof(float)); + memcpy(&rctx.freq_scale, (int32_t *) op_params + 6, sizeof(float)); + memcpy(&rctx.ext_factor, (int32_t *) op_params + 7, sizeof(float)); + memcpy(&rctx.attn_factor, (int32_t *) op_params + 8, sizeof(float)); + memcpy(&rctx.beta_fast, (int32_t *) op_params + 9, sizeof(float)); + memcpy(&rctx.beta_slow, (int32_t *) op_params + 10, sizeof(float)); + memcpy(&rctx.sections, (int32_t *) op_params + 11, sizeof(int) * 4); + + rctx.theta_scale = powf(rctx.freq_base, -2.0f / rctx.n_dims); + + rope_corr_dims(rctx.n_dims, rctx.n_ctx_orig, rctx.freq_base, rctx.beta_fast, rctx.beta_slow, rctx.corr_dims); + + rctx.src0_row_size = src0_row_size; + rctx.dst_row_size = dst_row_size; + rctx.src0_row_size_aligned = src0_row_size_aligned; + rctx.dst_row_size_aligned = dst_row_size_aligned; + rctx.theta_cache_offset = theta_cache_size_aligned; + + uint32_t ne0 = dst->ne[0]; uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + rctx.src0_nrows = src0_nrows; + + FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0, + rctx.ext_factor, rctx.theta_scale, rctx.attn_factor); if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, op_func, 
&rope_ctx, n_jobs); + uint32_t n_jobs = MIN(n_threads, src0_nrows); + rctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, rope_job_f32, &rctx, n_jobs); } return err; diff --git a/ggml/src/ggml-hexagon/htp/set-rows-ops.c b/ggml/src/ggml-hexagon/htp/set-rows-ops.c index 904484da9d..2fd6c90772 100644 --- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c @@ -43,11 +43,21 @@ \ const uint32_t nr = ne01; -static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) { +struct htp_set_rows_context { + struct htp_ops_context * octx; + struct fastdiv_values div_ne12; + struct fastdiv_values div_ne11; + uint32_t src0_nrows_per_thread; +}; + +static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { + struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data; + struct htp_ops_context * octx = srctx->octx; + set_rows_preamble; // parallelize by rows of src0 - const uint32_t dr = octx->src0_nrows_per_thread; + const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; @@ -56,8 +66,8 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12); - const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11); + const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12); + const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -76,15 +86,16 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, } } } - - return HTP_STATUS_OK; } -static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const int ith) { +static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) { + struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data; + struct htp_ops_context * octx = srctx->octx; + set_rows_preamble; // parallelize by rows of src0 - const uint32_t dr = octx->src0_nrows_per_thread; + const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; @@ -93,8 +104,8 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12); - const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11); + const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12); + const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -112,16 +123,6 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, } } } - - return HTP_STATUS_OK; -} - -static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) { - set_rows_thread_f16_f32((struct htp_ops_context *) data, n, i); -} - -static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) { - set_rows_thread_f32_f32((struct htp_ops_context *) data, n, i); } int op_set_rows(struct htp_ops_context * octx) { @@ -143,18 +144,20 @@ int op_set_rows(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - octx->set_rows_div_ne12 = init_fastdiv_values(ne12); - octx->set_rows_div_ne11 = init_fastdiv_values(ne11); + struct htp_set_rows_context srctx; + srctx.octx = octx; + srctx.div_ne12 = init_fastdiv_values(ne12); + srctx.div_ne11 = init_fastdiv_values(ne11); const uint32_t n_jobs = MIN(nr, octx->n_threads); - octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + srctx.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; switch(octx->dst.type) { case HTP_TYPE_F32: - worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f32_f32, octx, n_jobs); + worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_jobs); break; case HTP_TYPE_F16: - worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f16_f32, octx, n_jobs); + 
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f16_f32, &srctx, n_jobs); break; default: return HTP_STATUS_NO_SUPPORT; diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index e91a16d947..6e22eb6a63 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -10,6 +10,7 @@ #include "hex-dma.h" #include "hvx-utils.h" +#include "hex-fastdiv.h" #define GGML_COMMON_DECL_C #include "ggml-common.h" @@ -48,7 +49,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -struct softmax_th_ctx { +struct htp_softmax_context { bool use_f16; bool use_src1; uint32_t n_head; @@ -59,28 +60,48 @@ struct softmax_th_ctx { float m0; float m1; + uint32_t src0_nrows_per_thread; + struct fastdiv_values fastdiv_ne01; + struct fastdiv_values fastdiv_ne02; + struct fastdiv_values fastdiv_ne12; // For mask broadcasting + struct fastdiv_values fastdiv_ne13; // For mask broadcasting + size_t spad_stride; + struct htp_ops_context * octx; }; -static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) { +static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) { const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; - memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx)); + memset(smctx, 0, sizeof(struct htp_softmax_context)); - memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float)); - memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float)); + memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float)); + memcpy(&smctx->max_bias, (float *) octx->op_params + 1, sizeof(float)); - softmax_ctx->n_head = src0->ne[2]; - softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head)); + smctx->n_head = src0->ne[2]; + smctx->n_head_log2 = 1u << (uint32_t) floor(log2(smctx->n_head)); - softmax_ctx->m0 = powf(2.0f, 
-(softmax_ctx->max_bias) / softmax_ctx->n_head_log2); - softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2); + smctx->m0 = powf(2.0f, -(smctx->max_bias) / smctx->n_head_log2); + smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2); - softmax_ctx->use_src1 = (src1->ne[0] != 0); - softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); + smctx->use_src1 = (src1->ne[0] != 0); + smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); - softmax_ctx->octx = octx; + smctx->octx = octx; + + // Initialize fastdiv values + const uint32_t ne01 = src0->ne[1]; + const uint32_t ne02 = src0->ne[2]; + + if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01); + if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02); + + const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; + const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; + + if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12); + if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13); } static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, @@ -139,8 +160,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1); } - HVX_Vector v = hvx_vec_reduce_max_f32(max_vec); - max_vec = hvx_vec_repl4(v); + max_vec = hvx_vec_reduce_max_f32(max_vec); // replicated over all lanes #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { @@ -154,8 +174,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, v_pad[i] = v3; } - v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec)); - sum_vec = hvx_vec_repl4(v); + sum_vec = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec)); // replicated over all lanes HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v); HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec); @@ -183,83 +202,9 @@ static float hvx_softmax_f32(const uint8_t * restrict src, return sum; } -static void softmax_htp_f32(int nth, int ith, 
struct softmax_th_ctx * softmax_ctx, int opt_path) { - struct htp_ops_context * octx = softmax_ctx->octx; - - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - const struct htp_tensor * dst = &octx->dst; - - htp_softmax_preamble3; - - uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01); - uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01); - uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1); - - float * wp0 = (float *) src0_spad_data; - float * wp1 = (float *) src1_spad_data; - float * wp2 = (float *) dst_spad_data; - - for (uint32_t i03 = 0; i03 < ne03; i03++) { - for (uint32_t i02 = 0; i02 < ne02; i02++) { - for (uint32_t i01 = ith; i01 < ne01; i01 += nth) { - const uint32_t i11 = i01; - const uint32_t i12 = i02 % ne12; - const uint32_t i13 = i03 % ne13; - - // ALiBi - const uint32_t h = i02; // head - - const float slope = (softmax_ctx->max_bias > 0.0f) ? - h < softmax_ctx->n_head_log2 ? - powf(softmax_ctx->m0, h + 1) : - powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) : - 1.0f; - - float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03); - float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3); - - // broadcast the mask across rows - __fp16 * mp_f16 = (softmax_ctx->use_src1) ? - (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - float * mp_f32 = (softmax_ctx->use_src1) ? 
- (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - - if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) { - hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale, - (const uint8_t *) mp_f32, slope); - } else { - hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale); - if (mp_f32) { - if (softmax_ctx->use_f16) { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * (float) mp_f16[i]; - } - } else { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * mp_f32[i]; - } - } - } - } - - if (1 == opt_path) { - hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); - } else { - float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); - float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); - sum = sum > 0.0 ? (1.0 / sum) : 1; - hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); - } - } - } - } -} - -static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) { - struct htp_ops_context * octx = softmax_ctx->octx; +static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_softmax_context * smctx = (struct htp_softmax_context *) data; + struct htp_ops_context * octx = smctx->octx; const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; @@ -268,7 +213,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int htp_softmax_preamble3; const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows - const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + const uint32_t src0_nrows_per_thread = smctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -291,20 +236,103 @@ static void softmax_job_f32_per_thread(struct 
softmax_th_ctx * softmax_ctx, int opt_path = 1; } - softmax_htp_f32(nth, ith, softmax_ctx, opt_path); + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride); + + float * wp0 = (float *) src0_spad_data; + float * wp1 = (float *) src1_spad_data; + float * wp2 = (float *) dst_spad_data; + + uint32_t prev_i2 = (uint32_t)-1; + float slope = 1.0f; + + for (uint32_t r = src0_start_row; r < src0_end_row; ++r) { + uint32_t i1 = fastmodulo(r, ne01, &smctx->fastdiv_ne01); + uint32_t r_div_ne01 = fastdiv(r, &smctx->fastdiv_ne01); + uint32_t i2 = fastmodulo(r_div_ne01, ne02, &smctx->fastdiv_ne02); + uint32_t i3 = fastdiv(r_div_ne01, &smctx->fastdiv_ne02); + + // Map to original logic indices + // i01 = i1 + // i02 = i2 + // i03 = i3 + + const uint32_t i11 = i1; + // const uint32_t i12 = i2 % ne12; + // const uint32_t i13 = i3 % ne13; + + uint32_t i12, i13; + if (ne12 == ne02) { + i12 = i2; + } else { + i12 = fastmodulo(i2, ne12, &smctx->fastdiv_ne12); + } + + if (ne13 == ne03) { + i13 = i3; + } else { + i13 = fastmodulo(i3, ne13, &smctx->fastdiv_ne13); + } + + // ALiBi + if (i2 != prev_i2) { + const uint32_t h = i2; // head + + slope = (smctx->max_bias > 0.0f) ? + h < smctx->n_head_log2 ? + powf(smctx->m0, h + 1) : + powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : + 1.0f; + prev_i2 = i2; + } + + float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03); + float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3); + + // broadcast the mask across rows + __fp16 * mp_f16 = (smctx->use_src1) ? + (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + float * mp_f32 = (smctx->use_src1) ? 
+ (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + + if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) { + hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, + (const uint8_t *) mp_f32, slope); + } else { + hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale); + if (mp_f32) { + if (smctx->use_f16) { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * (float) mp_f16[i]; + } + } else { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * mp_f32[i]; + } + } + } + } + + if (1 == opt_path) { + hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); + } else { + float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); + float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); + sum = sum > 0.0 ? (1.0 / sum) : 1; + hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); + } + } t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, + smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) { - struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data; - softmax_job_f32_per_thread(p_softmax_ctx, n, i); -} - static int execute_op_softmax_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; @@ -312,17 +340,12 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { const struct htp_tensor * src1 = &octx->src1; struct htp_tensor * dst = &octx->dst; - worker_callback_t op_func; - const char * op_type = NULL; - - struct softmax_th_ctx softmax_ctx; + struct 
htp_softmax_context smctx; + const char * op_type = "softmax-f32"; switch (octx->op) { case HTP_OP_SOFTMAX: - op_func = softmax_job_dispatcher_f32; - op_type = "softmax-f32"; - - init_softmax_ctx(&softmax_ctx, octx); + init_softmax_ctx(&smctx, octx); break; default: @@ -342,6 +365,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // Use stride for calculating offset + smctx.spad_stride = hex_round_up(src0_row_size, 128); + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; if (src1->ne[0]) { @@ -371,8 +397,8 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs); + smctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_jobs); } return err; diff --git a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c index 62e45da2b3..04fa72182a 100644 --- a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c @@ -17,7 +17,6 @@ #include "htp-msg.h" #include "htp-ops.h" - #define sum_rows_preamble \ struct htp_tensor *src0 = &octx->src0;\ struct htp_tensor *dst = &octx->dst; \ @@ -42,53 +41,54 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; \ -static int sum_rows_thread_f32(struct htp_ops_context * octx, const int nth, const int ith) { - sum_rows_preamble; +struct sum_rows_context { + const uint8_t * src_data; + uint8_t * dst_data; + uint32_t ne00; + size_t src_stride; + size_t dst_stride; + uint32_t rows_per_thread; + uint32_t total_rows; + bool opt_path; +}; - 
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; +static void sum_rows_thread_f32(unsigned int nth, unsigned int ith, void *data) { + const struct sum_rows_context * smctx = (const struct sum_rows_context *) data; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t rows_per_thread = smctx->rows_per_thread; + const uint32_t total_rows = smctx->total_rows; - const uint32_t src0_start_row = src0_nrows_per_thread * ith; - const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t start_row = rows_per_thread * ith; + const uint32_t end_row = MIN(start_row + rows_per_thread, total_rows); - // no work for this thread - if (src0_start_row >= src0_end_row) { - return HTP_STATUS_OK; + if (start_row >= end_row) { + return; } - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } + const size_t src_stride = smctx->src_stride; + const size_t dst_stride = smctx->dst_stride; + const uint32_t ne00 = smctx->ne00; + const bool opt_path = smctx->opt_path; - const uint8_t * restrict data_src = (const uint8_t *) src0->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const float * restrict src_th = (const float *) (smctx->src_data + (start_row * src_stride)); + float * restrict dst_th = (float *) (smctx->dst_data + (start_row * dst_stride)); - const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size)); - float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size)); + // Calculate actual number of rows for this thread + const uint32_t n_rows = end_row - start_row; - for (uint32_t ir = 0; ir < src0_nrows_per_thread; ir++) { - const float * restrict src_local = src_th + (ir * ne00); + for (uint32_t ir = 0; ir < n_rows; ir++) { + const float * restrict src_local = src_th + (ir * (src_stride / 
sizeof(float))); - if (ir + 1 < src0_nrows_per_thread) { - hex_l2fetch(src_local + ne00, src0_row_size, src0_row_size, 1); + if (ir + 1 < n_rows) { + hex_l2fetch(src_local + (src_stride / sizeof(float)), src_stride, src_stride, 1); } - if (1 == opt_path) { + if (opt_path) { dst_th[ir] = hvx_reduce_sum_f32_a((const uint8_t *) src_local, ne00); } else { dst_th[ir] = hvx_reduce_sum_f32((const uint8_t *) src_local, ne00); } } - - return HTP_STATUS_OK; -} - -static void sum_rows_work_f32(unsigned int n, unsigned int i, void *data) { - sum_rows_thread_f32((struct htp_ops_context *) data, n, i); } int op_sum_rows(struct htp_ops_context * octx) { @@ -106,10 +106,25 @@ int op_sum_rows(struct htp_ops_context * octx) { const uint32_t src0_nrows = ne01 * ne02 * ne03; uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + uint32_t rows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, sum_rows_work_f32, octx, n_jobs); + bool opt_path = false; + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) { + opt_path = true; + } + + struct sum_rows_context smctx = { + .src_data = (const uint8_t *) src0->data, + .dst_data = (uint8_t *) dst->data, + .ne00 = ne00, + .src_stride = nb01, + .dst_stride = nb1, + .rows_per_thread = rows_per_thread, + .total_rows = src0_nrows, + .opt_path = opt_path, + }; + + worker_pool_run_func(octx->ctx->worker_pool, sum_rows_thread_f32, &smctx, n_jobs); return HTP_STATUS_OK; } - diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index ce879bf037..98135c50ab 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,6 +17,28 @@ #include "htp-msg.h" #include "htp-ops.h" +struct htp_unary_context { + struct htp_ops_context * octx; + + // Precomputed values + const uint8_t * data_src0; + uint8_t * data_dst; + + size_t src0_row_size; + size_t 
dst_row_size; + + size_t src0_row_size_aligned; + size_t dst_row_size_aligned; + + size_t src0_spad_half_size; + size_t dst_spad_half_size; + + uint32_t block; + uint32_t src0_nrows; + uint32_t src0_nrows_per_thread; + uint32_t nc; +}; + #define htp_unary_preamble \ const uint32_t ne00 = src->ne[0]; \ const uint32_t ne01 = src->ne[1]; \ @@ -57,8 +79,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); } - HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); - sum_v = hvx_vec_repl4(reduced_sum); + sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); // replicated over all lanes HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); @@ -75,128 +96,95 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } -static void scale_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void scale_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { float scale = 0.f; float bias = 0.f; memcpy(&scale, &op_params[0], sizeof(float)); memcpy(&bias, &op_params[1], sizeof(float)); for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias); + hvx_scale_offset_f32_aa((uint8_t *) dst_local, (const uint8_t *) 
src_local, row_elems, scale, bias); } } -static void rms_norm_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void rms_norm_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { float epsilon = 0.f; memcpy(&epsilon, op_params, sizeof(float)); for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - if (1 == opt_path) { - hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); - } else { - float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems); - - const float mean = sum / row_elems; - const float scale = 1.0f / sqrtf(mean + epsilon); - - hvx_scale_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale); - } + hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); } } -static void sqr_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void sqr_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * 
restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - if (1 == opt_path) { - hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } else { - hvx_sqr_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } + hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); } } -static void sqrt_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void sqrt_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - if (1 == opt_path) { - hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } else { - hvx_sqrt_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } + hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); } } -static void unary_job_f32_per_thread(const struct htp_tensor * src, - struct htp_tensor * dst, - uint8_t * spad, - int htp_op, - int32_t * op_params, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread) { +static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + 
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data; + struct htp_ops_context * octx = uctx->octx; + const struct htp_tensor * src = &octx->src0; + const struct htp_tensor * dst = &octx->dst; + htp_unary_preamble; - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; + int htp_op = octx->op; + int32_t * op_params = octx->op_params; + uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const size_t src0_row_size = uctx->src0_row_size; + const size_t dst_row_size = uctx->dst_row_size; + const size_t src0_row_size_aligned = uctx->src0_row_size_aligned; + const size_t dst_row_size_aligned = uctx->dst_row_size_aligned; + + const uint32_t src0_nrows = uctx->src0_nrows; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -208,79 +196,104 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - int is_aligned = 1; - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src->data, VLEN)) || (0 == hex_is_aligned((void *) dst->data, VLEN))) { - is_aligned = 0; - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; + const uint8_t * restrict data_src = uctx->data_src0; + uint8_t * restrict data_dst = uctx->data_dst; + + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); + + size_t src0_spad_half_size = uctx->src0_spad_half_size; + size_t dst_spad_half_size = uctx->dst_spad_half_size; + + const int BLOCK = uctx->block; + if (BLOCK == 0) { + FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", + octx->src0_spad.size_per_thread, src0_row_size_aligned); + return; } - const uint8_t * restrict 
data_src = (const uint8_t *) src->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + dma_queue * dma_queue = octx->ctx->dma[ith]; - const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size)); - float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size)); - uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01); + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { + const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); - switch (htp_op) { - case HTP_OP_RMS_NORM: - rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SCALE: - scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SQR: - sqr_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SQRT: - sqrt_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; + // Dummy DMA transation for sequencing (interleaving dst,src,dst,...) 
+ dma_queue_push_vtcm_to_ddr(dma_queue, + dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)), + dst_row_size, dst_row_size_aligned, 0); - default: - break; + dma_queue_push_ddr_to_vtcm(dma_queue, + dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)), + src0_row_size_aligned, src0_row_size, block_size); } + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); + + float * dst_spad = (float *) dma_queue_pop(dma_queue).src; + float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; + + // Process block in VTCM + switch (htp_op) { + case HTP_OP_RMS_NORM: + rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SCALE: + scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SQR: + sqr_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SQRT: + sqrt_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + default: + break; + } + + dma_queue_push_vtcm_to_ddr(dma_queue, + dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), + dst_row_size, dst_row_size_aligned, block_size); + + // prefetch N+2 loop iteration if any + const uint32_t pref_block = (ir + BLOCK * 2); + if (pref_block < src0_end_row) { + const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block); + dma_queue_push_ddr_to_vtcm(dma_queue, + dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)), + src0_row_size_aligned, src0_row_size, pref_block_size); + } + } + + dma_queue_flush(dma_queue); + t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0], + FARF(HIGH, "unary-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, src->ne[0], 
src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - - unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i, - octx->src0_nrows_per_thread); -} - static int execute_op_unary_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; const struct htp_tensor * src0 = &octx->src0; struct htp_tensor * dst = &octx->dst; - worker_callback_t unary_op_func; - const char * op_type = NULL; + const char * op_type = NULL; switch (octx->op) { case HTP_OP_RMS_NORM: - unary_op_func = unary_job_dispatcher_f32; - op_type = "rmsnorm-f32"; + op_type = "rmsnorm-f32"; break; case HTP_OP_SCALE: - unary_op_func = unary_job_dispatcher_f32; - op_type = "scale-f32"; + op_type = "scale-f32"; break; case HTP_OP_SQR: - unary_op_func = unary_job_dispatcher_f32; - op_type = "sqr-f32"; + op_type = "sqr-f32"; break; case HTP_OP_SQRT: - unary_op_func = unary_job_dispatcher_f32; - op_type = "sqrt-f32"; + op_type = "sqrt-f32"; break; default: @@ -294,32 +307,61 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t src0_row_size = src0->nb[1]; const size_t dst_row_size = dst->nb[1]; - // VTCM scratchpads for all tensors - octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); - size_t spad_size = octx->src0_spad.size + octx->dst_spad.size; + // VTCM scratchpads for all tensors + // N rows per thread, padded to HVX vector size + // Double buffering requires 2x size per buffer + + size_t spad_size_per_row = 2 * (src0_row_size_aligned + 
dst_row_size_aligned); + size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); + + // Make sure the reserved vtcm size is sufficient + if (vtcm_row_per_thread == 0) { + FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, + spad_size_per_row * n_threads); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread * 2; + octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread * 2; + + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); - return HTP_STATUS_VTCM_TOO_SMALL; - } - - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + struct htp_unary_context uctx = { + .octx = octx, + .src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs, + .src0_nrows = src0_nrows, - worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs); + .data_src0 = (const uint8_t *)src0->data, + .data_dst = (uint8_t *)dst->data, + + .src0_row_size = src0_row_size, + .dst_row_size = 
dst_row_size, + + .src0_row_size_aligned = src0_row_size_aligned, + .dst_row_size_aligned = dst_row_size_aligned, + + .src0_spad_half_size = octx->src0_spad.size_per_thread / 2, + .dst_spad_half_size = octx->dst_spad.size_per_thread / 2, + + .block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned, + .nc = src0->ne[0], + }; + + worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_jobs); } return err; diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index d19d4e920e..dfc051b28b 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -54,6 +54,6 @@ adb $adbserial $adbhost shell " \ $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb \ ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -fa on \ - -ngl 99 --device $device $cli_opts $@ \ + --ctx-size 8192 --ubatch-size 256 -fa on \ + -ngl 99 --device $device $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index da9df110a0..d53b588739 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -54,6 +54,6 @@ adb $adbserial $adbhost shell " \ $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb \ ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -fa on \ - -ngl 99 -no-cnv --device $device $cli_opts $@ \ + --ctx-size 8192 --ubatch-size 256 -fa on \ + -ngl 99 -no-cnv --device $device $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh index fc018e7269..41d7cd44f8 100755 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ b/scripts/snapdragon/adb/run-mtmd.sh @@ -58,11 +58,11 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ 
LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ - ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ - --mmproj $basedir/../gguf/$mmproj \ - --image $basedir/../gguf/$image \ - --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ - -ngl 99 --device $device -v $cli_opts $@ \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ + ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ + --mmproj $basedir/../gguf/$mmproj \ + --image $basedir/../gguf/$image \ + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ + --ctx-size 8192 --ubatch-size 256 -fa on \ + -ngl 99 --device $device -v $cli_opts $@ \ " diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 index b13161aa63..40c7acc430 100644 --- a/scripts/snapdragon/windows/run-cli.ps1 +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-completion.exe" ` --no-mmap -no-cnv -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on ` + --ctx-size 8192 --ubatch-size 128 -fa on ` -ngl 99 --device $device $cli_opts diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index fe1286d009..2fb809a9f0 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import os import sys import subprocess -HTTPLIB_VERSION = "d4180e923f846b44a3d30acd938438d6e64fc9f6" +HTTPLIB_VERSION = "refs/tags/v0.34.0" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7cd0bfc0d2..98d055d34e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2440,64 +2440,6 @@ size_t 
llama_context::state_write_data(llama_io_write_i & io) { // TODO: add more model-specific info which should prevent loading the session file if not identical } - // write output ids - { - LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); - - const auto n_outputs = this->n_outputs; - const auto & output_ids = this->output_ids; - - std::vector w_output_pos; - - w_output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch(); ++i) { - // map an output id to a position in the batch - int64_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT(pos < n_outputs); - w_output_pos[pos] = i; - } - } - - io.write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - // [TAG_CONTEXT_STATE_LOGITS] - // write logits - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - - const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits.data, logits_size * sizeof(float)); - } - } - - // write embeddings - { - LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); - - const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd.data, embd_size * sizeof(float)); - } - } - - // TODO: handle sampling buffers and samplers state ? 
- // https://github.com/ggml-org/llama.cpp/pull/17004 - if (memory != nullptr) { LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__); memory->state_write(io); @@ -2523,70 +2465,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { // TODO: add more info which needs to be identical but which is not verified otherwise } - // read output ids - { - LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__); - - auto n_outputs = this->n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > output_reserve(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - std::vector output_pos; - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__); - - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits.size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - io.read_to(this->logits.data, logits_size * sizeof(float)); - } - } - - // read embeddings - { - LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__); - - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd.size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd.data, embd_size * sizeof(float)); - } - } - - // TODO: handle sampling buffers and samplers state ? 
- // https://github.com/ggml-org/llama.cpp/pull/17004 - if (memory) { LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 764839b9bc..c93e29555b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1703,8 +1703,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 69b25a1bf9..9c118eab7e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2027,7 +2027,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; } else if ( tokenizer_pre == "gpt-4o" || - tokenizer_pre == "llama4") { + tokenizer_pre == "llama4" || + tokenizer_pre == "kanana2") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; } else if ( diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp index c10bde91b6..d4cd62c71e 100644 --- a/tests/test-backend-sampler.cpp +++ b/tests/test-backend-sampler.cpp @@ -361,7 +361,7 @@ static void test_backend_temp_sampling(const test_params & params) { GGML_ASSERT(false && "Failed to decode token"); } - // Verfify sequence 0 + // Verify sequence 0 { int32_t batch_idx = test_ctx.idx_for_seq(0); int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx); @@ -379,7 +379,7 @@ static void test_backend_temp_sampling(const test_params & params) { } - // Verfify sequence 1 + // Verify 
sequence 1 { int32_t batch_idx = test_ctx.idx_for_seq(1); @@ -395,7 +395,7 @@ static void test_backend_temp_sampling(const test_params & params) { } } - // lambda to testing non-positive temperature values. + // lambda for testing non-positive temperature values. auto test_argmax_temp = [&](float temp) { printf("\nTesting temperature = %.1f\n", temp); @@ -454,7 +454,7 @@ static void test_backend_temp_ext_sampling(const test_params & params) { } } - // lambda to testing non-positive temp/delta/exponent values. + // lambda for testing non-positive temp/delta/exponent values. auto test_argmax_temp = [&](float temp, float delta, float exponent) { printf("\nTesting temperature = %.1f, delta = %1.f, exponent = %1.f\n", temp, delta, exponent); @@ -530,7 +530,7 @@ static void test_backend_min_p_sampling(const test_params & params) { printf("min-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str()); GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab); - // Decode and sampler 10 more tokens + // Decode and sample 10 more tokens for (int i = 0; i < 10; i++) { int32_t loop_idx = test_ctx.idx_for_seq(seq_id); llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx); @@ -582,7 +582,7 @@ static void test_backend_top_p_sampling(const test_params & params) { printf("top-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str()); GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab); - // Decode and sampler 10 more tokens + // Decode and sample 10 more tokens for (int i = 0; i < 10; i++) { int32_t loop_idx = test_ctx.idx_for_seq(seq_id); llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx); @@ -619,7 +619,7 @@ static void test_backend_multi_sequence_sampling(const test_params & params) { GGML_ASSERT(false && "Failed to decode token"); } - // Verfiy sequence 0 + // Verify sequence 0 { int32_t batch_idx = test_ctx.idx_for_seq(0); llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), 
batch_idx); @@ -763,7 +763,7 @@ static void test_backend_logit_bias_sampling(const test_params & params) { printf("backend logit bias sampling test PASSED\n"); } -// This test verifies that it is possible to have two different backend sampler, +// This test verifies that it is possible to have two different backend samplers, // one that uses the backend dist sampler, and another that uses CPU dist sampler. static void test_backend_mixed_sampling(const test_params & params) { struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params(); @@ -791,7 +791,7 @@ static void test_backend_mixed_sampling(const test_params & params) { GGML_ASSERT(false && "Failed to decode token"); } - // Verfiy sequence 0 that used the dist backend sampler. + // Verify sequence 0 that used the dist backend sampler. { int32_t batch_idx = test_ctx.idx_for_seq(0); llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx); @@ -802,7 +802,7 @@ static void test_backend_mixed_sampling(const test_params & params) { //GGML_ASSERT(llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx) == 0); } - // Verfiy sequence 1 that used the top-k backend sampler. + // Verify sequence 1 that used the top-k backend sampler. { int32_t batch_idx = test_ctx.idx_for_seq(1); float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx); @@ -934,7 +934,7 @@ static void test_backend_cpu_mixed_batch(const test_params & params) { // samplers. llama_set_sampler(test_ctx.ctx.get(), 0, nullptr); - // Create a CPU sampler and verify we can sampler from it. + // Create a CPU sampler and verify we can sample from it. 
struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params(); llama_sampler_ptr chain(llama_sampler_chain_init(chain_params)); llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy()); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 1bef5b9f44..f3d19118b5 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -229,6 +229,20 @@ common_chat_tool python_tool { "required": ["code"] })", }; +common_chat_tool todo_list_tool { + /* .name = */ "todo_list", + /* .description = */ "Create or update the todo list", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "todos": { + "type": "array", + "description": "List of TODO list items" + } + }, + "required": ["todos"] + })", +}; common_chat_tool code_interpreter_tool { /* .name = */ "code_interpreter", /* .description = */ "an ipython interpreter", @@ -3018,542 +3032,6 @@ Hey there!<|im_end|> ); } - // Test Qwen3-Coder XML format - { - // Basic XML tool call parsing - assert_msg_equals( - message_assist_call, - test_chat_parse( - "\n" - " \n" - " \n" - " 1\n" - " \n" - " \n" - "", - /* is_partial= */ false, - {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); - - // Multiple parameters with different types - common_chat_msg expected_multi_param; - expected_multi_param.role = "assistant"; - expected_multi_param.tool_calls = { - { "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}", "" } - }; - - test_parser_with_streaming(expected_multi_param, - "\n" - " \n" - " \n" - " John Doe\n" - " \n" - " \n" - " 30\n" - " \n" - " \n" - " true\n" - " \n" - " \n" - " 95.5\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Special characters and Unicode - common_chat_msg expected_special_chars; - expected_special_chars.role = "assistant"; - expected_special_chars.tool_calls = { - { "unicode_function", "{\"message\":\"Hello 世界! 
🌍 Special chars: @#$%^&*()\"}", "" } - }; - - test_parser_with_streaming(expected_special_chars, - "\n" - " \n" - " \n" - " Hello 世界! 🌍 Special chars: @#$%^&*()\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Multiline content with newlines and indentation - common_chat_msg expected_multiline; - expected_multiline.role = "assistant"; - expected_multiline.tool_calls = { - { "code_function", "{\"code\":\"def hello():\\n print(\\\"Hello, World!\\\")\\n return True\"}", "" } - }; - - test_parser_with_streaming(expected_multiline, - "\n" - " \n" - " \n" - "def hello():\n" - " print(\"Hello, World!\")\n" - " return True\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // JSON object as parameter value - common_chat_msg expected_json_param; - expected_json_param.role = "assistant"; - expected_json_param.tool_calls = { - { "json_function", "{\"config\":{\"host\":\"localhost\",\"port\":8080,\"ssl\":false}}", "" } - }; - - test_parser_with_streaming( - expected_json_param, - "\n" - " \n" - " \n" - " {\"host\": \"localhost\", \"port\": 8080, \"ssl\": false}\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Array as parameter value - common_chat_msg expected_array_param; - expected_array_param.role = "assistant"; - expected_array_param.tool_calls = { - { "array_function", "{\"items\":[\"apple\",\"banana\",\"cherry\"]}", "" } - }; - - test_parser_with_streaming( - expected_array_param, - "\n" - " \n" - " \n" - " [\"apple\", \"banana\", \"cherry\"]\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Empty parameter - common_chat_msg 
expected_empty_param; - expected_empty_param.role = "assistant"; - expected_empty_param.tool_calls = { - { "empty_function", "{\"empty_param\":\"\"}", "" } - }; - - test_parser_with_streaming( - expected_empty_param, - "\n" - " \n" - " \n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Boolean values (true/false) - common_chat_msg expected_boolean; - expected_boolean.role = "assistant"; - expected_boolean.tool_calls = { - { "boolean_function", "{\"enabled\":true,\"debug\":false}", "" } - }; - - test_parser_with_streaming( - expected_boolean, - "\n" - " \n" - " \n" - " true\n" - " \n" - " \n" - " false\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Null value - common_chat_msg expected_null; - expected_null.role = "assistant"; - expected_null.tool_calls = { - { "null_function", "{\"optional_param\":null}", "" } - }; - - test_parser_with_streaming( - expected_null, - "\n" - " \n" - " \n" - " null\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Negative numbers and scientific notation - common_chat_msg expected_numbers; - expected_numbers.role = "assistant"; - expected_numbers.tool_calls = { - { "math_function", "{\"negative\":-42,\"decimal\":-3.14,\"scientific\":1.23e-4}", "" } - }; - - test_parser_with_streaming( - expected_numbers, - "\n" - " \n" - " \n" - " -42\n" - " \n" - " \n" - " -3.14\n" - " \n" - " \n" - " 1.23e-4\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // XML-like content in parameters (should be escaped) - common_chat_msg expected_xml_content; - expected_xml_content.role = "assistant"; - 
expected_xml_content.tool_calls = { - { "xml_function", "{\"xml_content\":\"value\"}", "" } - }; - - test_parser_with_streaming( - expected_xml_content, - "\n" - " \n" - " \n" - " value\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Quotes and escape characters - common_chat_msg expected_quotes; - expected_quotes.role = "assistant"; - expected_quotes.tool_calls = { - { "quote_function", "{\"message\":\"She said \\\"Hello!\\\" and left.\"}", "" } - }; - - test_parser_with_streaming( - expected_quotes, - "\n" - " \n" - " \n" - " She said \"Hello!\" and left.\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Long parameter value (simplified) - std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data."; - - common_chat_msg expected_long_text; - expected_long_text.role = "assistant"; - expected_long_text.tool_calls = { - { "long_function", "{\"long_text\":\"" + long_text + "\"}", "" } - }; - - test_parser_with_streaming( - expected_long_text, - "\n" - " \n" - " \n" - " " + long_text + "\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Mixed content with text before and after tool call - common_chat_msg expected_mixed_content; - expected_mixed_content.role = "assistant"; - expected_mixed_content.content = "I'll help you search for products. "; - expected_mixed_content.tool_calls = { - { "search_function", "{\"query\":\"laptops\"}", "" } - }; - - test_parser_with_streaming( - expected_mixed_content, - "I'll help you search for products. 
\n" - " \n" - " \n" - " laptops\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Compact format (no extra whitespace) - common_chat_msg expected_compact; - expected_compact.role = "assistant"; - expected_compact.tool_calls = { - { "compact_function", "{\"param\":\"value\"}", "" } - }; - - test_parser_with_streaming( - expected_compact, - "value", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Function name with underscores and numbers - common_chat_msg expected_complex_name; - expected_complex_name.role = "assistant"; - expected_complex_name.tool_calls = { - { "get_user_data_v2", "{\"user_id\":12345}", "" } - }; - - test_parser_with_streaming( - expected_complex_name, - "\n" - " \n" - " \n" - " 12345\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Parameter names with underscores and numbers - common_chat_msg expected_complex_params; - expected_complex_params.role = "assistant"; - expected_complex_params.tool_calls = { - { "test_function", "{\"param_1\":\"value1\",\"param_2_name\":\"value2\",\"param3\":123}", "" } - }; - - test_parser_with_streaming( - expected_complex_params, - "\n" - " \n" - " \n" - " value1\n" - " \n" - " \n" - " value2\n" - " \n" - " \n" - " 123\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Very deeply nested XML content in parameter - common_chat_msg expected_deep_xml; - expected_deep_xml.role = "assistant"; - expected_deep_xml.tool_calls = { - { "xml_parser", "{\"xml\":\"deep content\"}", "" } - }; - - test_parser_with_streaming( - expected_deep_xml, - "\n" - " \n" - " \n" - " deep content\n" - " \n" - " \n" - "", - [&](const 
std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Parameter with only whitespace - common_chat_msg expected_whitespace_param; - expected_whitespace_param.role = "assistant"; - expected_whitespace_param.tool_calls = { - { "whitespace_function", "{\"spaces\":\"\"}", "" } - }; - - test_parser_with_streaming( - expected_whitespace_param, - "\n" - " \n" - " \n" - " \n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Parameter with tabs and mixed whitespace - common_chat_msg expected_mixed_whitespace; - expected_mixed_whitespace.role = "assistant"; - expected_mixed_whitespace.tool_calls = { - { "tab_function", "{\"content\":\"line1\\n\\tindented line\\n spaces\"}", "" } - }; - - test_parser_with_streaming( - expected_mixed_whitespace, - "\n" - " \n" - " \n" - "line1\n" - "\tindented line\n" - " spaces\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Control characters and special Unicode - common_chat_msg expected_control_chars; - expected_control_chars.role = "assistant"; - expected_control_chars.tool_calls = { - { "control_function", "{\"text\":\"Line1\\nLine2\\tTabbed\\rCarriage return\"}", "" } - }; - - test_parser_with_streaming( - expected_control_chars, - "\n" - " \n" - " \n" - "Line1\nLine2\tTabbed\rCarriage return\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Emoji and extended Unicode characters - common_chat_msg expected_emoji; - expected_emoji.role = "assistant"; - expected_emoji.tool_calls = { - { "emoji_function", "{\"message\":\"Hello! 
👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" } - }; - - test_parser_with_streaming( - expected_emoji, - "\n" - " \n" - " \n" - " Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Mathematical expressions and formulas - common_chat_msg expected_math; - expected_math.role = "assistant"; - expected_math.tool_calls = { - { "math_function", "{\"formula\":\"E = mc² and ∫f(x)dx = F(x) + C\"}", "" } - }; - - test_parser_with_streaming( - expected_math, - "\n" - " \n" - " \n" - " E = mc² and ∫f(x)dx = F(x) + C\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // SQL injection-like content (should be safely escaped) - common_chat_msg expected_sql; - expected_sql.role = "assistant"; - expected_sql.tool_calls = { - { "sql_function", "{\"query\":\"SELECT * FROM users WHERE id = 1; DROP TABLE users; --\"}", "" } - }; - - test_parser_with_streaming( - expected_sql, - "\n" - " \n" - " \n" - " SELECT * FROM users WHERE id = 1; DROP TABLE users; --\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // HTML/XML injection content - common_chat_msg expected_html; - expected_html.role = "assistant"; - expected_html.tool_calls = { - { "html_function", "{\"content\":\"\"}", "" } - }; - - test_parser_with_streaming( - expected_html, - "\n" - " \n" - " \n" - " \n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Binary-like content (base64) - common_chat_msg expected_binary; - expected_binary.role = "assistant"; - expected_binary.tool_calls = { - { "binary_function", 
"{\"data\":\"SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\"}", "" } - }; - - test_parser_with_streaming( - expected_binary, - "\n" - " \n" - " \n" - " SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - - // Very large numbers (should be parsed as scientific notation) - common_chat_msg expected_large_numbers; - expected_large_numbers.role = "assistant"; - expected_large_numbers.tool_calls = { - { "number_function", "{\"big_int\":1e+60}", "" } // Large number becomes scientific notation - }; - - test_parser_with_streaming( - expected_large_numbers, - "\n" - " \n" - " \n" - " 999999999999999999999999999999999999999999999999999999999999\n" - " \n" - " \n" - "", - [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); - } - - { - // Qwen3-Coder template - auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja"); - common_chat_templates_inputs inputs; - inputs.messages = { message_user }; - - common_chat_tool qwen_union_tool { - /* .name = */ "qwen_union", - /* .description = */ "Test tool for union/anyOf handling", - /* .parameters = */ R"({ - "type": "object", - "properties": { - "priority": { "type": ["number", "null"] }, - "maybe_text": { "anyOf": [ { "type": "string" } ] }, - "config": { "anyOf": [ { "type": "object" }, { "type": "null" } ] } - }, - "required": [] - })", - }; - inputs.tools = { qwen_union_tool }; - - auto params = common_chat_templates_apply(tmpls.get(), inputs); - assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, params.format); - assert_equals(false, params.grammar.empty()); - - // Grammar should compile successfully - auto grammar = build_grammar(params.grammar); - GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types"); - } - { // Step-3.5-Flash template: uses same XML output 
format as Qwen3-Coder and Nemotron v3, // but with support. Routes to the Nemotron v3 PEG parser for streaming and @@ -3665,6 +3143,135 @@ static void test_template_output_peg_parsers() { }); } + { + // Qwen3-Coder + auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja"); + + // Test basic message + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "Hello, world!\nWhat's up?"; + t.expect = message_assist; + }); + + // Test tool call + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + ""; + t.params.tools = {special_function_tool}; + t.expect = message_assist_call; + }); + + // Test parallel tool calls + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "1\n" + "\n" + "\n" + "2\n" + "\n" + "\n" + ""; + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ {}, + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ {}, + }}; + }); + + // Test tool call with string parameter + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + ""; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test tool call with JSON parameter + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "[{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]\n" + "\n" + "\n" + ""; + t.params.tools = {todo_list_tool}; + + t.expect.tool_calls = {{ + /* .name 
= */ "todo_list", + /* .arguments = */ "{\"todos\": [{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]}", + /* .id = */ {}, + }}; + }); + + // Test tool call with string parameter and no closing tag + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = + "\n" + "\n" + "\n" + "def hello():\n" + " print(\"Hello, world!\")\n" + "\n" + "hello()\n" + "\n" + ""; + t.params.tools = {python_tool}; + + t.expect.tool_calls = {{ + /* .name = */ "python", + /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", + /* .id = */ {}, + }}; + }); + + // Test response format + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = R"({"amount": 123.45, "date": "2025-12-03"})"; + t.params.json_schema = invoice_schema; + + t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } + { // NVIDIA Nemotron-3 Nano auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja"); diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index f5197bd33f..05ea8ca9e9 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -32,6 +32,7 @@ static void test_string_methods(testing & t); static void test_array_methods(testing & t); static void test_object_methods(testing & t); static void test_hasher(testing & t); +static void test_stats(testing & t); static void test_fuzzing(testing & t); static bool g_python_mode = false; @@ -70,6 +71,7 @@ int main(int argc, char *argv[]) { t.test("object methods", test_object_methods); if (!g_python_mode) { t.test("hasher", test_hasher); + t.test("stats", test_stats); t.test("fuzzing", test_fuzzing); } @@ -1795,6 +1797,63 @@ static void test_hasher(testing & t) { }); } +static void test_stats(testing & t) { + static auto get_stats = [](const std::string & tmpl, const json & vars) -> jinja::value { + jinja::lexer lexer; + auto lexer_res = lexer.tokenize(tmpl); + + jinja::program prog = 
jinja::parse_from_tokens(lexer_res); + + jinja::context ctx(tmpl); + jinja::global_from_json(ctx, json{{ "val", vars }}, true); + ctx.is_get_stats = true; + + jinja::runtime runtime(ctx); + runtime.execute(prog); + + return ctx.get_val("val"); + }; + + t.test("stats", [](testing & t) { + jinja::value val = get_stats( + "{{val.num}} " + "{{val.str}} " + "{{val.arr[0]}} " + "{{val.obj.key1}} " + "{{val.nested | tojson}}", + // Note: the json below will be wrapped inside "val" in the context + json{ + {"num", 1}, + {"str", "abc"}, + {"arr", json::array({1, 2, 3})}, + {"obj", json::object({{"key1", 1}, {"key2", 2}, {"key3", 3}})}, + {"nested", json::object({ + {"inner_key1", json::array({1, 2})}, + {"inner_key2", json::object({{"a", "x"}, {"b", "y"}})} + })}, + {"mixed", json::object({ + {"used", 1}, + {"unused", 2}, + })}, + } + ); + + t.assert_true("num is used", val->at("num")->stats.used); + t.assert_true("str is used", val->at("str")->stats.used); + + t.assert_true("arr is used", val->at("arr")->stats.used); + t.assert_true("arr[0] is used", val->at("arr")->at(0)->stats.used); + t.assert_true("arr[1] is not used", !val->at("arr")->at(1)->stats.used); + + t.assert_true("obj is used", val->at("obj")->stats.used); + t.assert_true("obj.key1 is used", val->at("obj")->at("key1")->stats.used); + t.assert_true("obj.key2 is not used", !val->at("obj")->at("key2")->stats.used); + + t.assert_true("inner_key1[0] is used", val->at("nested")->at("inner_key1")->at(0)->stats.used); + t.assert_true("inner_key2.a is used", val->at("nested")->at("inner_key2")->at("a")->stats.used); + }); +} + static void test_template_cpp(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { t.test(name, [&tmpl, &vars, &expect](testing & t) { jinja::lexer lexer; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index ad421e6326..e57bf52e36 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -380,6 +380,15 @@ int main(int argc, char ** 
argv) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } + if (inf.fim_sep_token != LLAMA_TOKEN_NULL) { + cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true); + cur_msg += fname; + cur_msg.push_back('\n'); + } else { + cur_msg += "--- File: "; + cur_msg += fname; + cur_msg += " ---\n"; + } cur_msg += marker; console::log("Loaded text from '%s'\n", fname.c_str()); continue; diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 977132756f..aed2c0e38f 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -387,6 +387,17 @@ int main(int argc, char ** argv) { } session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; + + // Logits are not stored as part of the session state so we need to + // "replay" the last token to get logits for sampling. + if (!session_tokens.empty() && n_match > 0 && n_match == session_tokens.size()) { + if (!common_replay_last_token(ctx, session_tokens.back(), n_match)) { + return 1; + } + + session_do_save = false; + LOG_INF("%s: replayed last token from session\n", __func__); + } } // number of tokens to keep when resetting context @@ -675,40 +686,27 @@ int main(int argc, char ** argv) { } if (!embd.empty()) { - int n_eval = (int) embd.size(); - LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - - GGML_ASSERT(n_eval <= params.n_batch); - if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { - LOG_ERR("%s : failed to eval\n", __func__); + const bool is_last_batch = (n_consumed >= (int) embd_inp.size()); + const bool save_now = session_do_save && is_last_batch; + if (!common_prompt_batch_decode(ctx, embd, n_past, params.n_batch, path_session, save_now)) { return 1; } - - n_past += n_eval; + session_tokens.insert(session_tokens.end(), embd.begin(), embd.begin()); + n_session_consumed = session_tokens.size(); + session_do_save = false; 
LOG_DBG("n_past = %d\n", n_past); + // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } - - if (!embd.empty() && !path_session.empty()) { - session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - n_session_consumed = session_tokens.size(); - } } embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // optionally save the session on first sample (for faster prompt loading next time) - if (session_do_save) { - session_do_save = false; - llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - - LOG_DBG("saved session to %s\n", path_session.c_str()); - } const llama_token id = common_sampler_sample(smpl, ctx, -1); diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index c69481e798..a5465fcd13 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index d717fb6698..88b6e77d82 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1105,6 +1105,8 @@ json convert_responses_to_chatcmpl(const json & response_body) { }; for (json item : input_value) { + bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant"; + if (exists_and_is_string(item, "content")) { // #responses_create-input-input_item_list-input_message-content-text_input // Only "Input message" contains item["content"]::string @@ -1193,7 +1195,7 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "message" ) { // #responses_create-input-input_item_list-item-output_message - std::vector chatcmpl_content; + auto chatcmpl_content = json::array(); for (const auto & output_text : item.at("content")) { const std::string type = 
json_value(output_text, "type", std::string()); @@ -1210,10 +1212,19 @@ json convert_responses_to_chatcmpl(const json & response_body) { }); } - item.erase("status"); - item.erase("type"); - item["content"] = chatcmpl_content; - chatcmpl_messages.push_back(item); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "content")) { + prev_msg["content"] = json::array(); + } + auto & prev_content = prev_msg["content"]; + prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end()); + } else { + item.erase("status"); + item.erase("type"); + item["content"] = chatcmpl_content; + chatcmpl_messages.push_back(item); + } } else if (exists_and_is_string(item, "arguments") && exists_and_is_string(item, "call_id") && exists_and_is_string(item, "name") && @@ -1221,24 +1232,27 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "function_call" ) { // #responses_create-input-input_item_list-item-function_tool_call - json msg = json { - {"role", "assistant"}, - {"tool_calls", json::array({ json { - {"function", json { - {"arguments", item.at("arguments")}, - {"name", item.at("name")}, - }}, - {"id", item.at("call_id")}, - {"type", "function"}, - }})}, + json tool_call = { + {"function", json { + {"arguments", item.at("arguments")}, + {"name", item.at("name")}, + }}, + {"id", item.at("call_id")}, + {"type", "function"}, }; - if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) { - // Move reasoning content from dummy message to tool call message - msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content"); - chatcmpl_messages.pop_back(); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "tool_calls")) { + prev_msg["tool_calls"] = json::array(); + } + prev_msg["tool_calls"].push_back(tool_call); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, 
+ {"tool_calls", json::array({tool_call})} + }); } - chatcmpl_messages.push_back(msg); } else if (exists_and_is_string(item, "call_id") && (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) && exists_and_is_string(item, "type") && @@ -1282,12 +1296,16 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("item['content']['text'] is not a string"); } - // Pack reasoning content in dummy message - chatcmpl_messages.push_back(json { - {"role", "assistant"}, - {"content", json::array()}, - {"reasoning_content", item.at("content")[0].at("text")}, - }); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + prev_msg["reasoning_content"] = item.at("content")[0].at("text"); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"content", json::array()}, + {"reasoning_content", item.at("content")[0].at("text")}, + }); + } } else { throw std::invalid_argument("Cannot determine type of 'item'"); } @@ -1296,20 +1314,6 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("'input' must be a string or array of objects"); } - // Remove unused dummy message which contains - // reasoning content not followed by tool call - chatcmpl_messages.erase(std::remove_if( - chatcmpl_messages.begin(), - chatcmpl_messages.end(), - [](const json & x){ return x.contains("role") && - x.at("role") == "assistant" && - x.contains("content") && - x.at("content") == json::array() && - x.contains("reasoning_content"); - }), - chatcmpl_messages.end() - ); - chatcmpl_body["messages"] = chatcmpl_messages; if (response_body.contains("tools")) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 8aab0d4c1b..0f2f3a45aa 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2911,6 +2911,9 @@ server_context_meta server_context::get_meta() const { /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab), /* 
fim_sub_token */ llama_vocab_fim_suf(impl->vocab), /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab), + /* fim_pad_token */ llama_vocab_fim_pad(impl->vocab), + /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), + /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), /* model_vocab_type */ llama_vocab_type(impl->vocab), /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), diff --git a/tools/server/server-context.h b/tools/server/server-context.h index c0b5d373ff..03c29f513b 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -30,6 +30,9 @@ struct server_context_meta { llama_token fim_pre_token; llama_token fim_sub_token; llama_token fim_mid_token; + llama_token fim_pad_token; + llama_token fim_rep_token; + llama_token fim_sep_token; // model meta enum llama_vocab_type model_vocab_type; diff --git a/tools/server/webui/README.md b/tools/server/webui/README.md index 98b01fdcd7..6fc908e274 100644 --- a/tools/server/webui/README.md +++ b/tools/server/webui/README.md @@ -101,7 +101,7 @@ In a separate terminal, start the backend server: ./llama-server -m model.gguf # Multi-model (ROUTER mode) -./llama-server --model-store /path/to/models +./llama-server --models-dir /path/to/models ``` ### 3. 
Start Development Servers diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte index c3cb8343fc..2130658dda 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte @@ -114,6 +114,11 @@ label: 'Render user content as Markdown', type: SettingsFieldType.CHECKBOX }, + { + key: SETTINGS_KEYS.FULL_HEIGHT_CODE_BLOCKS, + label: 'Use full height code blocks', + type: SettingsFieldType.CHECKBOX + }, { key: SETTINGS_KEYS.DISABLE_AUTO_SCROLL, label: 'Disable automatic scroll', diff --git a/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte index 0bc69a739f..a0944e18a0 100644 --- a/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte +++ b/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte @@ -38,6 +38,8 @@ import { ActionIconsCodeBlock, DialogCodePreview } from '$lib/components/app'; import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte'; import type { DatabaseMessageExtra } from '$lib/types/database'; + import { config } from '$lib/stores/settings.svelte'; + import { SETTINGS_KEYS } from '$lib/constants/settings-keys'; interface Props { attachments?: DatabaseMessageExtra[]; @@ -593,7 +595,12 @@ }); -
+
{#each renderedBlocks as block (block.id)}
@@ -914,6 +921,16 @@ line-height: 1.3; } + .full-height-code-blocks :global(.code-block-wrapper) { + max-height: none; + } + + .full-height-code-blocks :global(.code-block-scroll-container), + .full-height-code-blocks .streaming-code-scroll-container { + max-height: none; + overflow-y: visible; + } + div :global(.code-block-header) { display: flex; justify-content: space-between; diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index e011fa6ec1..ebffae1212 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -251,9 +251,6 @@ return options.find((option) => option.id === activeId); } - if (options.length === 1) { - return options[0]; - } // No selection - return undefined to show "Select model" return undefined; } diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts index 6f6dbea2ec..00dac3d6e9 100644 --- a/tools/server/webui/src/lib/constants/settings-config.ts +++ b/tools/server/webui/src/lib/constants/settings-config.ts @@ -22,6 +22,7 @@ export const SETTING_CONFIG_DEFAULT: Record = alwaysShowSidebarOnDesktop: false, autoShowSidebarOnNewChat: true, autoMicOnEmpty: false, + fullHeightCodeBlocks: false, // make sure these default values are in sync with `common.h` samplers: 'top_k;typ_p;top_p;min_p;temperature', backend_sampling: false, @@ -113,6 +114,8 @@ export const SETTING_CONFIG_INFO: Record = { 'Automatically show sidebar when starting a new chat. 
Disable to keep the sidebar hidden until you click on it.', autoMicOnEmpty: 'Automatically show microphone button instead of send button when textarea is empty for models with audio modality support.', + fullHeightCodeBlocks: + 'Always display code blocks at their full natural height, overriding any height limits.', pyInterpreterEnabled: 'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.', enableContinueGeneration: diff --git a/tools/server/webui/src/lib/constants/settings-keys.ts b/tools/server/webui/src/lib/constants/settings-keys.ts index 63960d4d56..38de41ffee 100644 --- a/tools/server/webui/src/lib/constants/settings-keys.ts +++ b/tools/server/webui/src/lib/constants/settings-keys.ts @@ -23,6 +23,7 @@ export const SETTINGS_KEYS = { DISABLE_AUTO_SCROLL: 'disableAutoScroll', ALWAYS_SHOW_SIDEBAR_ON_DESKTOP: 'alwaysShowSidebarOnDesktop', AUTO_SHOW_SIDEBAR_ON_NEW_CHAT: 'autoShowSidebarOnNewChat', + FULL_HEIGHT_CODE_BLOCKS: 'fullHeightCodeBlocks', // Sampling TEMPERATURE: 'temperature', DYNATEMP_RANGE: 'dynatemp_range', diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.ts b/tools/server/webui/src/lib/services/parameter-sync.service.ts index 1d7666e955..1acb5ce453 100644 --- a/tools/server/webui/src/lib/services/parameter-sync.service.ts +++ b/tools/server/webui/src/lib/services/parameter-sync.service.ts @@ -153,6 +153,12 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [ serverKey: 'enableContinueGeneration', type: SyncableParameterType.BOOLEAN, canSync: true + }, + { + key: 'fullHeightCodeBlocks', + serverKey: 'fullHeightCodeBlocks', + type: SyncableParameterType.BOOLEAN, + canSync: true } ]; diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index 4cb6167220..c4cc3d3860 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -306,6 +306,16 @@ class 
ModelsStore { const response = await ModelsService.listRouter(); this.routerModels = response.data; await this.fetchModalitiesForLoadedModels(); + + const o = this.models.filter((option) => { + const modelProps = this.getModelProps(option.model); + + return modelProps?.webui !== false; + }); + + if (o.length === 1 && this.isModelLoaded(o[0].model)) { + this.selectModelById(o[0].id); + } } catch (error) { console.warn('Failed to fetch router models:', error); this.routerModels = []; diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index 9d24594f98..15e118731f 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -1,5 +1,7 @@ #include "httplib.h" namespace httplib { +// httplib::any — type-erased value container (C++11 compatible) +// On C++17+ builds, thin wrappers around std::any are provided. /* * Implementation that will be part of the .cc file if split into .h + .cc. @@ -630,6 +632,56 @@ size_t to_utf8(int code, char *buff) { return 0; } +} // namespace detail + +namespace ws { +namespace impl { + +bool is_valid_utf8(const std::string &s) { + size_t i = 0; + auto n = s.size(); + while (i < n) { + auto c = static_cast(s[i]); + size_t len; + uint32_t cp; + if (c < 0x80) { + i++; + continue; + } else if ((c & 0xE0) == 0xC0) { + len = 2; + cp = c & 0x1F; + } else if ((c & 0xF0) == 0xE0) { + len = 3; + cp = c & 0x0F; + } else if ((c & 0xF8) == 0xF0) { + len = 4; + cp = c & 0x07; + } else { + return false; + } + if (i + len > n) { return false; } + for (size_t j = 1; j < len; j++) { + auto b = static_cast(s[i + j]); + if ((b & 0xC0) != 0x80) { return false; } + cp = (cp << 6) | (b & 0x3F); + } + // Overlong encoding check + if (len == 2 && cp < 0x80) { return false; } + if (len == 3 && cp < 0x800) { return false; } + if (len == 4 && cp < 0x10000) { return false; } + // Surrogate halves (U+D800..U+DFFF) and beyond U+10FFFF are invalid + if (cp >= 0xD800 && cp <= 0xDFFF) { return false; } + if (cp > 0x10FFFF) { 
return false; } + i += len; + } + return true; +} + +} // namespace impl +} // namespace ws + +namespace detail { + // NOTE: This code came up with the following stackoverflow post: // https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c std::string base64_encode(const std::string &in) { @@ -660,6 +712,281 @@ std::string base64_encode(const std::string &in) { return out; } +std::string sha1(const std::string &input) { + // RFC 3174 SHA-1 implementation + auto left_rotate = [](uint32_t x, uint32_t n) -> uint32_t { + return (x << n) | (x >> (32 - n)); + }; + + uint32_t h0 = 0x67452301; + uint32_t h1 = 0xEFCDAB89; + uint32_t h2 = 0x98BADCFE; + uint32_t h3 = 0x10325476; + uint32_t h4 = 0xC3D2E1F0; + + // Pre-processing: adding padding bits + std::string msg = input; + uint64_t original_bit_len = static_cast(msg.size()) * 8; + msg.push_back(static_cast(0x80)); + while (msg.size() % 64 != 56) { + msg.push_back(0); + } + + // Append original length in bits as 64-bit big-endian + for (int i = 56; i >= 0; i -= 8) { + msg.push_back(static_cast((original_bit_len >> i) & 0xFF)); + } + + // Process each 512-bit chunk + for (size_t offset = 0; offset < msg.size(); offset += 64) { + uint32_t w[80]; + + for (size_t i = 0; i < 16; i++) { + w[i] = + (static_cast(static_cast(msg[offset + i * 4])) + << 24) | + (static_cast(static_cast(msg[offset + i * 4 + 1])) + << 16) | + (static_cast(static_cast(msg[offset + i * 4 + 2])) + << 8) | + (static_cast( + static_cast(msg[offset + i * 4 + 3]))); + } + + for (int i = 16; i < 80; i++) { + w[i] = left_rotate(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1); + } + + uint32_t a = h0, b = h1, c = h2, d = h3, e = h4; + + for (int i = 0; i < 80; i++) { + uint32_t f, k; + if (i < 20) { + f = (b & c) | ((~b) & d); + k = 0x5A827999; + } else if (i < 40) { + f = b ^ c ^ d; + k = 0x6ED9EBA1; + } else if (i < 60) { + f = (b & c) | (b & d) | (c & d); + k = 0x8F1BBCDC; + } else { + f = b ^ c ^ d; + k = 0xCA62C1D6; + } + + uint32_t temp = 
left_rotate(a, 5) + f + e + k + w[i]; + e = d; + d = c; + c = left_rotate(b, 30); + b = a; + a = temp; + } + + h0 += a; + h1 += b; + h2 += c; + h3 += d; + h4 += e; + } + + // Produce the final hash as a 20-byte binary string + std::string hash(20, '\0'); + for (size_t i = 0; i < 4; i++) { + hash[i] = static_cast((h0 >> (24 - i * 8)) & 0xFF); + hash[4 + i] = static_cast((h1 >> (24 - i * 8)) & 0xFF); + hash[8 + i] = static_cast((h2 >> (24 - i * 8)) & 0xFF); + hash[12 + i] = static_cast((h3 >> (24 - i * 8)) & 0xFF); + hash[16 + i] = static_cast((h4 >> (24 - i * 8)) & 0xFF); + } + return hash; +} + +std::string websocket_accept_key(const std::string &client_key) { + const std::string magic = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"; + return base64_encode(sha1(client_key + magic)); +} + +bool is_websocket_upgrade(const Request &req) { + if (req.method != "GET") { return false; } + + // Check Upgrade: websocket (case-insensitive) + auto upgrade_it = req.headers.find("Upgrade"); + if (upgrade_it == req.headers.end()) { return false; } + auto upgrade_val = upgrade_it->second; + std::transform(upgrade_val.begin(), upgrade_val.end(), upgrade_val.begin(), + ::tolower); + if (upgrade_val != "websocket") { return false; } + + // Check Connection header contains "Upgrade" + auto connection_it = req.headers.find("Connection"); + if (connection_it == req.headers.end()) { return false; } + auto connection_val = connection_it->second; + std::transform(connection_val.begin(), connection_val.end(), + connection_val.begin(), ::tolower); + if (connection_val.find("upgrade") == std::string::npos) { return false; } + + // Check Sec-WebSocket-Key is a valid base64-encoded 16-byte value (24 chars) + // RFC 6455 Section 4.2.1 + auto ws_key = req.get_header_value("Sec-WebSocket-Key"); + if (ws_key.size() != 24 || ws_key[22] != '=' || ws_key[23] != '=') { + return false; + } + static const std::string b64chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + for (size_t 
i = 0; i < 22; i++) { + if (b64chars.find(ws_key[i]) == std::string::npos) { return false; } + } + + // Check Sec-WebSocket-Version: 13 + auto version = req.get_header_value("Sec-WebSocket-Version"); + if (version != "13") { return false; } + + return true; +} + +bool write_websocket_frame(Stream &strm, ws::Opcode opcode, + const char *data, size_t len, bool fin, + bool mask) { + // First byte: FIN + opcode + uint8_t header[2]; + header[0] = static_cast((fin ? 0x80 : 0x00) | + (static_cast(opcode) & 0x0F)); + + // Second byte: MASK + payload length + if (len < 126) { + header[1] = static_cast(len); + if (mask) { header[1] |= 0x80; } + if (strm.write(reinterpret_cast(header), 2) < 0) { return false; } + } else if (len <= 0xFFFF) { + header[1] = 126; + if (mask) { header[1] |= 0x80; } + if (strm.write(reinterpret_cast(header), 2) < 0) { return false; } + uint8_t ext[2]; + ext[0] = static_cast((len >> 8) & 0xFF); + ext[1] = static_cast(len & 0xFF); + if (strm.write(reinterpret_cast(ext), 2) < 0) { return false; } + } else { + header[1] = 127; + if (mask) { header[1] |= 0x80; } + if (strm.write(reinterpret_cast(header), 2) < 0) { return false; } + uint8_t ext[8]; + for (int i = 7; i >= 0; i--) { + ext[7 - i] = static_cast((len >> (i * 8)) & 0xFF); + } + if (strm.write(reinterpret_cast(ext), 8) < 0) { return false; } + } + + if (mask) { + // Generate random mask key + thread_local std::mt19937 rng(std::random_device{}()); + uint8_t mask_key[4]; + auto r = rng(); + std::memcpy(mask_key, &r, 4); + if (strm.write(reinterpret_cast(mask_key), 4) < 0) { return false; } + + // Write masked payload in chunks + const size_t chunk_size = 4096; + std::vector buf((std::min)(len, chunk_size)); + for (size_t offset = 0; offset < len; offset += chunk_size) { + size_t n = (std::min)(chunk_size, len - offset); + for (size_t i = 0; i < n; i++) { + buf[i] = + data[offset + i] ^ static_cast(mask_key[(offset + i) % 4]); + } + if (strm.write(buf.data(), n) < 0) { return false; } + } + } else 
{ + if (len > 0) { + if (strm.write(data, len) < 0) { return false; } + } + } + + return true; +} + +} // namespace detail + +namespace ws { +namespace impl { + +bool read_websocket_frame(Stream &strm, Opcode &opcode, + std::string &payload, bool &fin, + bool expect_masked, size_t max_len) { + // Read first 2 bytes + uint8_t header[2]; + if (strm.read(reinterpret_cast(header), 2) != 2) { return false; } + + fin = (header[0] & 0x80) != 0; + + // RSV1, RSV2, RSV3 must be 0 when no extension is negotiated + if (header[0] & 0x70) { return false; } + + opcode = static_cast(header[0] & 0x0F); + bool masked = (header[1] & 0x80) != 0; + uint64_t payload_len = header[1] & 0x7F; + + // RFC 6455 Section 5.5: control frames MUST NOT be fragmented and + // MUST have a payload length of 125 bytes or less + bool is_control = (static_cast(opcode) & 0x08) != 0; + if (is_control) { + if (!fin) { return false; } + if (payload_len > 125) { return false; } + } + + if (masked != expect_masked) { return false; } + + // Extended payload length + if (payload_len == 126) { + uint8_t ext[2]; + if (strm.read(reinterpret_cast(ext), 2) != 2) { return false; } + payload_len = (static_cast(ext[0]) << 8) | ext[1]; + } else if (payload_len == 127) { + uint8_t ext[8]; + if (strm.read(reinterpret_cast(ext), 8) != 8) { return false; } + // RFC 6455 Section 5.2: the most significant bit MUST be 0 + if (ext[0] & 0x80) { return false; } + payload_len = 0; + for (int i = 0; i < 8; i++) { + payload_len = (payload_len << 8) | ext[i]; + } + } + + if (payload_len > max_len) { return false; } + + // Read mask key if present + uint8_t mask_key[4] = {0}; + if (masked) { + if (strm.read(reinterpret_cast(mask_key), 4) != 4) { return false; } + } + + // Read payload + payload.resize(static_cast(payload_len)); + if (payload_len > 0) { + size_t total_read = 0; + while (total_read < payload_len) { + auto n = strm.read(&payload[total_read], + static_cast(payload_len - total_read)); + if (n <= 0) { return false; } + 
total_read += static_cast(n); + } + } + + // Unmask if needed + if (masked) { + for (size_t i = 0; i < payload.size(); i++) { + payload[i] ^= static_cast(mask_key[i % 4]); + } + } + + return true; +} + +} // namespace impl +} // namespace ws + +namespace detail { + bool is_valid_path(const std::string &path) { size_t level = 0; size_t i = 0; @@ -1333,12 +1660,14 @@ public: bool is_readable() const override; bool wait_readable() const override; bool wait_writable() const override; + bool is_peer_alive() const override; ssize_t read(char *ptr, size_t size) override; ssize_t write(const char *ptr, size_t size) override; void get_remote_ip_and_port(std::string &ip, int &port) const override; void get_local_ip_and_port(std::string &ip, int &port) const override; socket_t socket() const override; time_t duration() const override; + void set_read_timeout(time_t sec, time_t usec = 0) override; private: socket_t sock_; @@ -2653,6 +2982,50 @@ bool read_headers(Stream &strm, Headers &headers) { return true; } +bool read_websocket_upgrade_response(Stream &strm, + const std::string &expected_accept, + std::string &selected_subprotocol) { + // Read status line + const auto bufsiz = 2048; + char buf[bufsiz]; + stream_line_reader line_reader(strm, buf, bufsiz); + if (!line_reader.getline()) { return false; } + + // Check for "HTTP/1.1 101" + auto line = std::string(line_reader.ptr(), line_reader.size()); + if (line.find("HTTP/1.1 101") == std::string::npos) { return false; } + + // Parse headers using existing read_headers + Headers headers; + if (!read_headers(strm, headers)) { return false; } + + // Verify Upgrade: websocket (case-insensitive) + auto upgrade_it = headers.find("Upgrade"); + if (upgrade_it == headers.end()) { return false; } + auto upgrade_val = upgrade_it->second; + std::transform(upgrade_val.begin(), upgrade_val.end(), upgrade_val.begin(), + ::tolower); + if (upgrade_val != "websocket") { return false; } + + // Verify Connection header contains "Upgrade" 
(case-insensitive) + auto connection_it = headers.find("Connection"); + if (connection_it == headers.end()) { return false; } + auto connection_val = connection_it->second; + std::transform(connection_val.begin(), connection_val.end(), + connection_val.begin(), ::tolower); + if (connection_val.find("upgrade") == std::string::npos) { return false; } + + // Verify Sec-WebSocket-Accept header value + auto it = headers.find("Sec-WebSocket-Accept"); + if (it == headers.end() || it->second != expected_accept) { return false; } + + // Extract negotiated subprotocol + auto proto_it = headers.find("Sec-WebSocket-Protocol"); + if (proto_it != headers.end()) { selected_subprotocol = proto_it->second; } + + return true; +} + enum class ReadContentResult { Success, // Successfully read the content PayloadTooLarge, // The content exceeds the specified payload limit @@ -2941,10 +3314,10 @@ bool write_content_with_progress(Stream &strm, return ok; }; - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; + data_sink.is_writable = [&]() -> bool { return strm.is_peer_alive(); }; while (offset < end_offset && !is_shutting_down()) { - if (!strm.wait_writable()) { + if (!strm.wait_writable() || !strm.is_peer_alive()) { error = Error::Write; return false; } else if (!content_provider(offset, end_offset - offset, data_sink)) { @@ -2956,6 +3329,11 @@ bool write_content_with_progress(Stream &strm, } } + if (offset < end_offset) { // exited due to is_shutting_down(), not completion + error = Error::Write; + return false; + } + error = Error::Success; return true; } @@ -2995,12 +3373,12 @@ write_content_without_length(Stream &strm, return ok; }; - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; + data_sink.is_writable = [&]() -> bool { return strm.is_peer_alive(); }; data_sink.done = [&](void) { data_available = false; }; while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { + if (!strm.wait_writable() || 
!strm.is_peer_alive()) { return false; } else if (!content_provider(offset, 0, data_sink)) { return false; @@ -3008,7 +3386,8 @@ write_content_without_length(Stream &strm, return false; } } - return true; + return !data_available; // true only if done() was called, false if shutting + // down } template @@ -3044,7 +3423,7 @@ write_content_chunked(Stream &strm, const ContentProvider &content_provider, return ok; }; - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; + data_sink.is_writable = [&]() -> bool { return strm.is_peer_alive(); }; auto done_with_trailer = [&](const Headers *trailer) { if (!ok) { return; } @@ -3094,7 +3473,7 @@ write_content_chunked(Stream &strm, const ContentProvider &content_provider, }; while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { + if (!strm.wait_writable() || !strm.is_peer_alive()) { error = Error::Write; return false; } else if (!content_provider(offset, 0, data_sink)) { @@ -3106,6 +3485,11 @@ write_content_chunked(Stream &strm, const ContentProvider &content_provider, } } + if (data_available) { // exited due to is_shutting_down(), not done() + error = Error::Write; + return false; + } + error = Error::Success; return true; } @@ -3772,6 +4156,73 @@ serialize_multipart_formdata(const UploadFormDataItems &items, return body; } +size_t get_multipart_content_length(const UploadFormDataItems &items, + const std::string &boundary) { + size_t total = 0; + for (const auto &item : items) { + total += serialize_multipart_formdata_item_begin(item, boundary).size(); + total += item.content.size(); + total += serialize_multipart_formdata_item_end().size(); + } + total += serialize_multipart_formdata_finish(boundary).size(); + return total; +} + +struct MultipartSegment { + const char *data; + size_t size; +}; + +// NOTE: items must outlive the returned ContentProvider +// (safe for synchronous use inside Post/Put/Patch) +ContentProvider +make_multipart_content_provider(const 
UploadFormDataItems &items, + const std::string &boundary) { + // Own the per-item header strings and the finish string + std::vector owned; + owned.reserve(items.size() + 1); + for (const auto &item : items) + owned.push_back(serialize_multipart_formdata_item_begin(item, boundary)); + owned.push_back(serialize_multipart_formdata_finish(boundary)); + + // Flat segment list: [header, content, "\r\n"] * N + [finish] + std::vector segs; + segs.reserve(items.size() * 3 + 1); + static const char crlf[] = "\r\n"; + for (size_t i = 0; i < items.size(); i++) { + segs.push_back({owned[i].data(), owned[i].size()}); + segs.push_back({items[i].content.data(), items[i].content.size()}); + segs.push_back({crlf, 2}); + } + segs.push_back({owned.back().data(), owned.back().size()}); + + struct MultipartState { + std::vector owned; + std::vector segs; + }; + auto state = std::make_shared(); + state->owned = std::move(owned); + // `segs` holds raw pointers into owned strings; std::string move preserves + // the data pointer, so these pointers remain valid after the move above. 
+ state->segs = std::move(segs); + + return [state](size_t offset, size_t length, DataSink &sink) -> bool { + size_t pos = 0; + for (const auto &seg : state->segs) { + // Loop invariant: pos <= offset (proven by advancing pos only when + // offset - pos >= seg.size, i.e., the segment doesn't contain offset) + if (seg.size > 0 && offset - pos < seg.size) { + size_t seg_offset = offset - pos; + size_t available = seg.size - seg_offset; + size_t to_write = (std::min)(available, length); + return sink.write(seg.data + seg_offset, to_write); + } + pos += seg.size; + } + return true; // past end (shouldn't be reached when content_length is exact) + }; +} + void coalesce_ranges(Ranges &ranges, size_t content_length) { if (ranges.size() <= 1) return; @@ -4020,15 +4471,6 @@ bool expect_content(const Request &req) { return false; } -bool has_crlf(const std::string &s) { - auto p = s.c_str(); - while (*p) { - if (*p == '\r' || *p == '\n') { return true; } - p++; - } - return false; -} - #ifdef _WIN32 class WSInit { public: @@ -4148,6 +4590,52 @@ bool is_field_content(const std::string &s) { bool is_field_value(const std::string &s) { return is_field_content(s); } } // namespace fields + +bool perform_websocket_handshake(Stream &strm, const std::string &host, + int port, const std::string &path, + const Headers &headers, + std::string &selected_subprotocol) { + // Validate path and host + if (!fields::is_field_value(path) || !fields::is_field_value(host)) { + return false; + } + + // Validate user-provided headers + for (const auto &h : headers) { + if (!fields::is_field_name(h.first) || !fields::is_field_value(h.second)) { + return false; + } + } + + // Generate random Sec-WebSocket-Key + thread_local std::mt19937 rng(std::random_device{}()); + std::string key_bytes(16, '\0'); + for (size_t i = 0; i < 16; i += 4) { + auto r = rng(); + std::memcpy(&key_bytes[i], &r, (std::min)(size_t(4), size_t(16 - i))); + } + auto client_key = base64_encode(key_bytes); + + // Build upgrade 
request + std::string req_str = "GET " + path + " HTTP/1.1\r\n"; + req_str += "Host: " + host + ":" + std::to_string(port) + "\r\n"; + req_str += "Upgrade: websocket\r\n"; + req_str += "Connection: Upgrade\r\n"; + req_str += "Sec-WebSocket-Key: " + client_key + "\r\n"; + req_str += "Sec-WebSocket-Version: 13\r\n"; + for (const auto &h : headers) { + req_str += h.first + ": " + h.second + "\r\n"; + } + req_str += "\r\n"; + + if (strm.write(req_str.data(), req_str.size()) < 0) { return false; } + + // Verify 101 response and Sec-WebSocket-Accept header + auto expected_accept = websocket_accept_key(client_key); + return read_websocket_upgrade_response(strm, expected_accept, + selected_subprotocol); +} + } // namespace detail /* @@ -4170,12 +4658,14 @@ public: bool is_readable() const override; bool wait_readable() const override; bool wait_writable() const override; + bool is_peer_alive() const override; ssize_t read(char *ptr, size_t size) override; ssize_t write(const char *ptr, size_t size) override; void get_remote_ip_and_port(std::string &ip, int &port) const override; void get_local_ip_and_port(std::string &ip, int &port) const override; socket_t socket() const override; time_t duration() const override; + void set_read_timeout(time_t sec, time_t usec = 0) override; private: socket_t sock_; @@ -4268,6 +4758,39 @@ std::string SHA_512(const std::string &s) { #endif return hash_to_hex(hash); } +#elif defined(CPPHTTPLIB_WOLFSSL_SUPPORT) +namespace { +template +std::string hash_to_hex(const unsigned char (&hash)[N]) { + std::stringstream ss; + for (size_t i = 0; i < N; ++i) { + ss << std::hex << std::setw(2) << std::setfill('0') + << static_cast(hash[i]); + } + return ss.str(); +} +} // namespace + +std::string MD5(const std::string &s) { + unsigned char hash[WC_MD5_DIGEST_SIZE]; + wc_Md5Hash(reinterpret_cast(s.c_str()), + static_cast(s.size()), hash); + return hash_to_hex(hash); +} + +std::string SHA_256(const std::string &s) { + unsigned char 
hash[WC_SHA256_DIGEST_SIZE]; + wc_Sha256Hash(reinterpret_cast(s.c_str()), + static_cast(s.size()), hash); + return hash_to_hex(hash); +} + +std::string SHA_512(const std::string &s) { + unsigned char hash[WC_SHA512_DIGEST_SIZE]; + wc_Sha512Hash(reinterpret_cast(s.c_str()), + static_cast(s.size()), hash); + return hash_to_hex(hash); +} #endif bool is_ip_address(const std::string &host) { @@ -4510,6 +5033,53 @@ bool verify_cert_with_windows_schannel( } #endif // _WIN32 +bool setup_client_tls_session(const std::string &host, tls::ctx_t &ctx, + tls::session_t &session, socket_t sock, + bool server_certificate_verification, + const std::string &ca_cert_file_path, + tls::ca_store_t ca_cert_store, + time_t timeout_sec, time_t timeout_usec) { + using namespace tls; + + ctx = create_client_context(); + if (!ctx) { return false; } + + if (server_certificate_verification) { + if (!ca_cert_file_path.empty()) { + load_ca_file(ctx, ca_cert_file_path.c_str()); + } + if (ca_cert_store) { set_ca_store(ctx, ca_cert_store); } + load_system_certs(ctx); + } + + bool is_ip = is_ip_address(host); + +#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT + if (is_ip && server_certificate_verification) { + set_verify_client(ctx, false); + } else { + set_verify_client(ctx, server_certificate_verification); + } +#endif + + session = create_session(ctx, sock); + if (!session) { return false; } + + // RFC 6066: SNI must not be set for IP addresses + if (!is_ip) { set_sni(session, host.c_str()); } + if (server_certificate_verification) { set_hostname(session, host.c_str()); } + + if (!connect_nonblocking(session, sock, timeout_sec, timeout_usec, nullptr)) { + return false; + } + + if (server_certificate_verification) { + if (get_verify_result(session) != 0) { return false; } + } + + return true; +} + } // namespace detail #endif // CPPHTTPLIB_SSL_ENABLED @@ -5327,22 +5897,37 @@ ssize_t detail::BodyReader::read(char *buf, size_t len) { } // ThreadPool implementation -ThreadPool::ThreadPool(size_t n, size_t mqr) - : 
shutdown_(false), max_queued_requests_(mqr) { - threads_.reserve(n); - while (n) { - threads_.emplace_back(worker(*this)); - n--; +ThreadPool::ThreadPool(size_t n, size_t max_n, size_t mqr) + : base_thread_count_(n), max_queued_requests_(mqr), idle_thread_count_(0), + shutdown_(false) { +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + if (max_n != 0 && max_n < n) { + std::string msg = "max_threads must be >= base_threads"; + throw std::invalid_argument(msg); + } +#endif + max_thread_count_ = max_n == 0 ? n : max_n; + threads_.reserve(base_thread_count_); + for (size_t i = 0; i < base_thread_count_; i++) { + threads_.emplace_back(std::thread([this]() { worker(false); })); } } bool ThreadPool::enqueue(std::function fn) { { std::unique_lock lock(mutex_); + if (shutdown_) { return false; } if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) { return false; } jobs_.push_back(std::move(fn)); + + // Spawn a dynamic thread if no idle threads and under max + if (idle_thread_count_ == 0 && + threads_.size() + dynamic_threads_.size() < max_thread_count_) { + cleanup_finished_threads(); + dynamic_threads_.emplace_back(std::thread([this]() { worker(true); })); + } } cond_.notify_one(); @@ -5350,7 +5935,6 @@ bool ThreadPool::enqueue(std::function fn) { } void ThreadPool::shutdown() { - // Stop all worker threads... { std::unique_lock lock(mutex_); shutdown_ = true; @@ -5358,31 +5942,84 @@ void ThreadPool::shutdown() { cond_.notify_all(); - // Join... for (auto &t : threads_) { - t.join(); + if (t.joinable()) { t.join(); } + } + + // Move dynamic_threads_ to a local list under the lock to avoid racing + // with worker threads that call move_to_finished() concurrently. 
+ std::list remaining_dynamic; + { + std::unique_lock lock(mutex_); + remaining_dynamic = std::move(dynamic_threads_); + } + for (auto &t : remaining_dynamic) { + if (t.joinable()) { t.join(); } + } + + std::unique_lock lock(mutex_); + cleanup_finished_threads(); +} + +void ThreadPool::move_to_finished(std::thread::id id) { + // Must be called with mutex_ held + for (auto it = dynamic_threads_.begin(); it != dynamic_threads_.end(); ++it) { + if (it->get_id() == id) { + finished_threads_.push_back(std::move(*it)); + dynamic_threads_.erase(it); + return; + } } } -ThreadPool::worker::worker(ThreadPool &pool) : pool_(pool) {} +void ThreadPool::cleanup_finished_threads() { + // Must be called with mutex_ held + for (auto &t : finished_threads_) { + if (t.joinable()) { t.join(); } + } + finished_threads_.clear(); +} -void ThreadPool::worker::operator()() { +void ThreadPool::worker(bool is_dynamic) { for (;;) { std::function fn; { - std::unique_lock lock(pool_.mutex_); + std::unique_lock lock(mutex_); + idle_thread_count_++; - pool_.cond_.wait(lock, - [&] { return !pool_.jobs_.empty() || pool_.shutdown_; }); + if (is_dynamic) { + auto has_work = cond_.wait_for( + lock, std::chrono::seconds(CPPHTTPLIB_THREAD_POOL_IDLE_TIMEOUT), + [&] { return !jobs_.empty() || shutdown_; }); + if (!has_work) { + // Timed out with no work - exit this dynamic thread + idle_thread_count_--; + move_to_finished(std::this_thread::get_id()); + break; + } + } else { + cond_.wait(lock, [&] { return !jobs_.empty() || shutdown_; }); + } - if (pool_.shutdown_ && pool_.jobs_.empty()) { break; } + idle_thread_count_--; - fn = pool_.jobs_.front(); - pool_.jobs_.pop_front(); + if (shutdown_ && jobs_.empty()) { break; } + + fn = std::move(jobs_.front()); + jobs_.pop_front(); } assert(true == static_cast(fn)); fn(); + + // Dynamic thread: exit if queue is empty after task completion + if (is_dynamic) { + std::unique_lock lock(mutex_); + if (jobs_.empty()) { + move_to_finished(std::this_thread::get_id()); + 
break; + } + } } #if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) && \ @@ -5445,8 +6082,11 @@ bool SocketStream::wait_readable() const { } bool SocketStream::wait_writable() const { - return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 && - is_socket_alive(sock_); + return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0; +} + +bool SocketStream::is_peer_alive() const { + return detail::is_socket_alive(sock_); } ssize_t SocketStream::read(char *ptr, size_t size) { @@ -5540,6 +6180,11 @@ time_t SocketStream::duration() const { .count(); } +void SocketStream::set_read_timeout(time_t sec, time_t usec) { + read_timeout_sec_ = sec; + read_timeout_usec_ = usec; +} + // Buffer stream implementation bool BufferStream::is_readable() const { return true; } @@ -5772,7 +6417,11 @@ bool SSLSocketStream::wait_readable() const { bool SSLSocketStream::wait_writable() const { return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 && - is_socket_alive(sock_) && !tls::is_peer_closed(session_, sock_); + !tls::is_peer_closed(session_, sock_); +} + +bool SSLSocketStream::is_peer_alive() const { + return !tls::is_peer_closed(session_, sock_); } ssize_t SSLSocketStream::read(char *ptr, size_t size) { @@ -5865,6 +6514,11 @@ time_t SSLSocketStream::duration() const { .count(); } +void SSLSocketStream::set_read_timeout(time_t sec, time_t usec) { + read_timeout_sec_ = sec; + read_timeout_usec_ = usec; +} + } // namespace detail #endif // CPPHTTPLIB_SSL_ENABLED @@ -5874,8 +6528,10 @@ time_t SSLSocketStream::duration() const { // HTTP server implementation Server::Server() - : new_task_queue( - [] { return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT); }) { + : new_task_queue([] { + return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT, + CPPHTTPLIB_THREAD_POOL_MAX_COUNT); + }) { #ifndef _WIN32 signal(SIGPIPE, SIG_IGN); #endif @@ -5950,6 +6606,21 @@ Server &Server::Options(const std::string &pattern, Handler handler) { 
return *this; } +Server &Server::WebSocket(const std::string &pattern, + WebSocketHandler handler) { + websocket_handlers_.push_back( + {make_matcher(pattern), std::move(handler), nullptr}); + return *this; +} + +Server &Server::WebSocket(const std::string &pattern, + WebSocketHandler handler, + SubProtocolSelector sub_protocol_selector) { + websocket_handlers_.push_back({make_matcher(pattern), std::move(handler), + std::move(sub_protocol_selector)}); + return *this; +} + bool Server::set_base_dir(const std::string &dir, const std::string &mount_point) { return set_mount_point(mount_point, dir); @@ -6274,35 +6945,33 @@ bool Server::write_response_core(Stream &strm, bool close_connection, if (post_routing_handler_) { post_routing_handler_(req, res); } // Response line and headers - { - detail::BufferStream bstrm; - if (!detail::write_response_line(bstrm, res.status)) { return false; } - if (header_writer_(bstrm, res.headers) <= 0) { return false; } + detail::BufferStream bstrm; + if (!detail::write_response_line(bstrm, res.status)) { return false; } + if (header_writer_(bstrm, res.headers) <= 0) { return false; } - // Flush buffer - auto &data = bstrm.get_buffer(); - detail::write_data(strm, data.data(), data.size()); + // Combine small body with headers to reduce write syscalls + if (req.method != "HEAD" && !res.body.empty() && !res.content_provider_) { + bstrm.write(res.body.data(), res.body.size()); } - // Body + // Log before writing to avoid race condition with client-side code that + // accesses logger-captured data immediately after receiving the response. 
+ output_log(req, res); + + // Flush buffer + auto &data = bstrm.get_buffer(); + if (!detail::write_data(strm, data.data(), data.size())) { return false; } + + // Streaming body auto ret = true; - if (req.method != "HEAD") { - if (!res.body.empty()) { - if (!detail::write_data(strm, res.body.data(), res.body.size())) { - ret = false; - } - } else if (res.content_provider_) { - if (write_content_with_provider(strm, req, res, boundary, content_type)) { - res.content_provider_success_ = true; - } else { - ret = false; - } + if (req.method != "HEAD" && res.content_provider_) { + if (write_content_with_provider(strm, req, res, boundary, content_type)) { + res.content_provider_success_ = true; + } else { + ret = false; } } - // Log - output_log(req, res); - return ret; } @@ -7072,7 +7741,8 @@ Server::process_request(Stream &strm, const std::string &remote_addr, int remote_port, const std::string &local_addr, int local_port, bool close_connection, bool &connection_closed, - const std::function &setup_request) { + const std::function &setup_request, + bool *websocket_upgraded) { std::array buf{}; detail::stream_line_reader line_reader(strm, buf.data(), buf.size()); @@ -7175,6 +7845,77 @@ Server::process_request(Stream &strm, const std::string &remote_addr, return !detail::is_socket_alive(sock); }; + // WebSocket upgrade + // Check pre_routing_handler_ before upgrading so that authentication + // and other middleware can reject the request with an HTTP response + // (e.g., 401) before the protocol switches. 
+ if (detail::is_websocket_upgrade(req)) { + if (pre_routing_handler_ && + pre_routing_handler_(req, res) == HandlerResponse::Handled) { + if (res.status == -1) { res.status = StatusCode::OK_200; } + return write_response(strm, close_connection, req, res); + } + // Find matching WebSocket handler + for (const auto &entry : websocket_handlers_) { + if (entry.matcher->match(req)) { + // Compute accept key + auto client_key = req.get_header_value("Sec-WebSocket-Key"); + auto accept_key = detail::websocket_accept_key(client_key); + + // Negotiate subprotocol + std::string selected_subprotocol; + if (entry.sub_protocol_selector) { + auto protocol_header = req.get_header_value("Sec-WebSocket-Protocol"); + if (!protocol_header.empty()) { + std::vector protocols; + std::istringstream iss(protocol_header); + std::string token; + while (std::getline(iss, token, ',')) { + // Trim whitespace + auto start = token.find_first_not_of(' '); + auto end = token.find_last_not_of(' '); + if (start != std::string::npos) { + protocols.push_back(token.substr(start, end - start + 1)); + } + } + selected_subprotocol = entry.sub_protocol_selector(protocols); + } + } + + // Send 101 Switching Protocols + std::string handshake_response = "HTTP/1.1 101 Switching Protocols\r\n" + "Upgrade: websocket\r\n" + "Connection: Upgrade\r\n" + "Sec-WebSocket-Accept: " + + accept_key + "\r\n"; + if (!selected_subprotocol.empty()) { + if (!detail::fields::is_field_value(selected_subprotocol)) { + return false; + } + handshake_response += + "Sec-WebSocket-Protocol: " + selected_subprotocol + "\r\n"; + } + handshake_response += "\r\n"; + if (strm.write(handshake_response.data(), handshake_response.size()) < + 0) { + return false; + } + + connection_closed = true; + if (websocket_upgraded) { *websocket_upgraded = true; } + + { + // Use WebSocket-specific read timeout instead of HTTP timeout + strm.set_read_timeout(CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND, 0); + ws::WebSocket ws(strm, req, true); + 
entry.handler(req, ws); + } + return true; + } + } + // No matching handler - fall through to 404 + } + // Routing auto routed = false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS @@ -7271,6 +8012,7 @@ bool Server::process_and_close_socket(socket_t sock) { int local_port = 0; detail::get_local_ip_and_port(sock, local_addr, local_port); + bool websocket_upgraded = false; auto ret = detail::process_server_socket( svr_sock_, sock, keep_alive_max_count_, keep_alive_timeout_sec_, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, @@ -7278,7 +8020,7 @@ bool Server::process_and_close_socket(socket_t sock) { [&](Stream &strm, bool close_connection, bool &connection_closed) { return process_request(strm, remote_addr, remote_port, local_addr, local_port, close_connection, connection_closed, - nullptr); + nullptr, &websocket_upgraded); }); detail::shutdown_socket(sock); @@ -9019,8 +9761,10 @@ Result ClientImpl::Post(const std::string &path, const Headers &headers, const auto &boundary = detail::make_multipart_data_boundary(); const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Post(path, headers, content_length, + detail::make_multipart_content_provider(items, boundary), + content_type, progress); } Result ClientImpl::Post(const std::string &path, const Headers &headers, @@ -9033,8 +9777,10 @@ Result ClientImpl::Post(const std::string &path, const Headers &headers, const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Post(path, headers, content_length, + 
detail::make_multipart_content_provider(items, boundary), + content_type, progress); } Result ClientImpl::Post(const std::string &path, const Headers &headers, @@ -9212,8 +9958,10 @@ Result ClientImpl::Put(const std::string &path, const Headers &headers, const auto &boundary = detail::make_multipart_data_boundary(); const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Put(path, headers, content_length, + detail::make_multipart_content_provider(items, boundary), + content_type, progress); } Result ClientImpl::Put(const std::string &path, const Headers &headers, @@ -9226,8 +9974,10 @@ Result ClientImpl::Put(const std::string &path, const Headers &headers, const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Put(path, headers, content_length, + detail::make_multipart_content_provider(items, boundary), + content_type, progress); } Result ClientImpl::Put(const std::string &path, const Headers &headers, @@ -9407,8 +10157,10 @@ Result ClientImpl::Patch(const std::string &path, const Headers &headers, const auto &boundary = detail::make_multipart_data_boundary(); const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Patch(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Patch(path, headers, content_length, + detail::make_multipart_content_provider(items, 
boundary), + content_type, progress); } Result ClientImpl::Patch(const std::string &path, const Headers &headers, @@ -9421,8 +10173,10 @@ Result ClientImpl::Patch(const std::string &path, const Headers &headers, const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Patch(path, headers, body, content_type, progress); + auto content_length = detail::get_multipart_content_length(items, boundary); + return Patch(path, headers, content_length, + detail::make_multipart_content_provider(items, boundary), + content_type, progress); } Result ClientImpl::Patch(const std::string &path, const Headers &headers, @@ -10579,9 +11333,9 @@ bool SSLServer::process_and_close_socket(socket_t sock) { // Use scope_exit to ensure cleanup on all paths (including exceptions) bool handshake_done = false; bool ret = false; + bool websocket_upgraded = false; auto cleanup = detail::scope_exit([&] { - // Shutdown gracefully if handshake succeeded and processing was successful - if (handshake_done) { shutdown(session, ret); } + if (handshake_done) { shutdown(session, !websocket_upgraded && ret); } free_session(session); detail::shutdown_socket(sock); detail::close_socket(sock); @@ -10621,9 +11375,10 @@ bool SSLServer::process_and_close_socket(socket_t sock) { read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, [&](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, remote_addr, remote_port, local_addr, - local_port, close_connection, connection_closed, - [&](Request &req) { req.ssl = session; }); + return process_request( + strm, remote_addr, remote_port, local_addr, local_port, + close_connection, connection_closed, + [&](Request &req) { req.ssl = session; }, &websocket_upgraded); }); return ret; @@ -10929,11 +11684,11 @@ bool SSLClient::initialize_ssl(Socket &socket, Error &error) { bool is_ip = 
detail::is_ip_address(host_); -#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT - // MbedTLS needs explicit verification mode (OpenSSL uses SSL_VERIFY_NONE - // by default and performs all verification post-handshake). +#if defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || defined(CPPHTTPLIB_WOLFSSL_SUPPORT) + // MbedTLS/wolfSSL need explicit verification mode (OpenSSL uses + // SSL_VERIFY_NONE by default and performs all verification post-handshake). // For IP addresses with verification enabled, use OPTIONAL mode since - // MbedTLS requires hostname for VERIFY_REQUIRED. + // these backends require hostname for strict verification. if (is_ip && server_certificate_verification_) { set_verify_client(ctx_, false); } else { @@ -11154,6 +11909,107 @@ VerifyCallback &get_mbedtls_verify_callback() { return callback; } +// Check if a string is an IPv4 address +bool is_ipv4_address(const std::string &str) { + int dots = 0; + for (char c : str) { + if (c == '.') { + dots++; + } else if (!isdigit(static_cast(c))) { + return false; + } + } + return dots == 3; +} + +// Parse IPv4 address string to bytes +bool parse_ipv4(const std::string &str, unsigned char *out) { + int parts[4]; + if (sscanf(str.c_str(), "%d.%d.%d.%d", &parts[0], &parts[1], &parts[2], + &parts[3]) != 4) { + return false; + } + for (int i = 0; i < 4; i++) { + if (parts[i] < 0 || parts[i] > 255) return false; + out[i] = static_cast(parts[i]); + } + return true; +} + +#ifdef _WIN32 +// Enumerate Windows system certificates and call callback with DER data +template +bool enumerate_windows_system_certs(Callback cb) { + bool loaded = false; + static const wchar_t *store_names[] = {L"ROOT", L"CA"}; + for (auto store_name : store_names) { + HCERTSTORE hStore = CertOpenSystemStoreW(0, store_name); + if (hStore) { + PCCERT_CONTEXT pContext = nullptr; + while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) != + nullptr) { + if (cb(pContext->pbCertEncoded, pContext->cbCertEncoded)) { + loaded = true; + } + } + CertCloseStore(hStore, 
0); + } + } + return loaded; +} +#endif + +#if defined(__APPLE__) && defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) +// Enumerate macOS Keychain certificates and call callback with DER data +template +bool enumerate_macos_keychain_certs(Callback cb) { + bool loaded = false; + CFArrayRef certs = nullptr; + OSStatus status = SecTrustCopyAnchorCertificates(&certs); + if (status == errSecSuccess && certs) { + CFIndex count = CFArrayGetCount(certs); + for (CFIndex i = 0; i < count; i++) { + SecCertificateRef cert = + (SecCertificateRef)CFArrayGetValueAtIndex(certs, i); + CFDataRef data = SecCertificateCopyData(cert); + if (data) { + if (cb(CFDataGetBytePtr(data), + static_cast(CFDataGetLength(data)))) { + loaded = true; + } + CFRelease(data); + } + } + CFRelease(certs); + } + return loaded; +} +#endif + +#if !defined(_WIN32) && !(defined(__APPLE__) && \ + defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)) +// Common CA certificate file paths on Linux/Unix +const char **system_ca_paths() { + static const char *paths[] = { + "/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu + "/etc/pki/tls/certs/ca-bundle.crt", // RHEL/CentOS + "/etc/ssl/ca-bundle.pem", // OpenSUSE + "/etc/pki/tls/cacert.pem", // OpenELEC + "/etc/ssl/cert.pem", // Alpine, FreeBSD + nullptr}; + return paths; +} + +// Common CA certificate directory paths on Linux/Unix +const char **system_ca_dirs() { + static const char *dirs[] = {"/etc/ssl/certs", // Debian/Ubuntu + "/etc/pki/tls/certs", // RHEL/CentOS + "/usr/share/ca-certificates", // Other + nullptr}; + return dirs; +} +#endif + } // namespace impl bool set_client_ca_file(ctx_t ctx, const char *ca_file, @@ -12730,33 +13586,6 @@ int mbedtls_sni_callback(void *p_ctx, mbedtls_ssl_context *ssl, int mbedtls_verify_callback(void *data, mbedtls_x509_crt *crt, int cert_depth, uint32_t *flags); -// Check if a string is an IPv4 address -bool is_ipv4_address(const std::string &str) { - int dots = 0; - for (char c : str) { - if (c == '.') { - dots++; - } 
else if (!isdigit(static_cast(c))) { - return false; - } - } - return dots == 3; -} - -// Parse IPv4 address string to bytes -bool parse_ipv4(const std::string &str, unsigned char *out) { - int parts[4]; - if (sscanf(str.c_str(), "%d.%d.%d.%d", &parts[0], &parts[1], &parts[2], - &parts[3]) != 4) { - return false; - } - for (int i = 0; i < 4; i++) { - if (parts[i] < 0 || parts[i] > 255) return false; - out[i] = static_cast(parts[i]); - } - return true; -} - // MbedTLS verify callback wrapper int mbedtls_verify_callback(void *data, mbedtls_x509_crt *crt, int cert_depth, uint32_t *flags) { @@ -12971,68 +13800,26 @@ bool load_system_certs(ctx_t ctx) { bool loaded = false; #ifdef _WIN32 - // Load from Windows certificate store (ROOT and CA) - static const wchar_t *store_names[] = {L"ROOT", L"CA"}; - for (auto store_name : store_names) { - HCERTSTORE hStore = CertOpenSystemStoreW(0, store_name); - if (hStore) { - PCCERT_CONTEXT pContext = nullptr; - while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) != - nullptr) { - int ret = mbedtls_x509_crt_parse_der( - &mctx->ca_chain, pContext->pbCertEncoded, pContext->cbCertEncoded); - if (ret == 0) { loaded = true; } - } - CertCloseStore(hStore, 0); - } - } + loaded = impl::enumerate_windows_system_certs( + [&](const unsigned char *data, size_t len) { + return mbedtls_x509_crt_parse_der(&mctx->ca_chain, data, len) == 0; + }); #elif defined(__APPLE__) && defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) - // Load from macOS Keychain - CFArrayRef certs = nullptr; - OSStatus status = SecTrustCopyAnchorCertificates(&certs); - if (status == errSecSuccess && certs) { - CFIndex count = CFArrayGetCount(certs); - for (CFIndex i = 0; i < count; i++) { - SecCertificateRef cert = - (SecCertificateRef)CFArrayGetValueAtIndex(certs, i); - CFDataRef data = SecCertificateCopyData(cert); - if (data) { - int ret = mbedtls_x509_crt_parse_der( - &mctx->ca_chain, CFDataGetBytePtr(data), - static_cast(CFDataGetLength(data))); - if (ret 
== 0) { loaded = true; } - CFRelease(data); - } - } - CFRelease(certs); - } + loaded = impl::enumerate_macos_keychain_certs( + [&](const unsigned char *data, size_t len) { + return mbedtls_x509_crt_parse_der(&mctx->ca_chain, data, len) == 0; + }); #else - // Try common CA certificate locations on Linux/Unix - static const char *ca_paths[] = { - "/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu - "/etc/pki/tls/certs/ca-bundle.crt", // RHEL/CentOS - "/etc/ssl/ca-bundle.pem", // OpenSUSE - "/etc/pki/tls/cacert.pem", // OpenELEC - "/etc/ssl/cert.pem", // Alpine, FreeBSD - nullptr}; - - for (const char **path = ca_paths; *path; ++path) { - int ret = mbedtls_x509_crt_parse_file(&mctx->ca_chain, *path); - if (ret >= 0) { + for (auto path = impl::system_ca_paths(); *path; ++path) { + if (mbedtls_x509_crt_parse_file(&mctx->ca_chain, *path) >= 0) { loaded = true; break; } } - // Also try the CA directory if (!loaded) { - static const char *ca_dirs[] = {"/etc/ssl/certs", // Debian/Ubuntu - "/etc/pki/tls/certs", // RHEL/CentOS - "/usr/share/ca-certificates", nullptr}; - - for (const char **dir = ca_dirs; *dir; ++dir) { - int ret = mbedtls_x509_crt_parse_path(&mctx->ca_chain, *dir); - if (ret >= 0) { + for (auto dir = impl::system_ca_dirs(); *dir; ++dir) { + if (mbedtls_x509_crt_parse_path(&mctx->ca_chain, *dir) >= 0) { loaded = true; break; } @@ -13083,6 +13870,18 @@ bool set_client_cert_pem(ctx_t ctx, const char *cert, const char *key, return false; } + // Verify that the certificate and private key match +#ifdef CPPHTTPLIB_MBEDTLS_V3 + ret = mbedtls_pk_check_pair(&mctx->own_cert.pk, &mctx->own_key, + mbedtls_ctr_drbg_random, &mctx->ctr_drbg); +#else + ret = mbedtls_pk_check_pair(&mctx->own_cert.pk, &mctx->own_key); +#endif + if (ret != 0) { + impl::mbedtls_last_error() = ret; + return false; + } + ret = mbedtls_ssl_conf_own_cert(&mctx->conf, &mctx->own_cert, &mctx->own_key); if (ret != 0) { impl::mbedtls_last_error() = ret; @@ -13116,6 +13915,18 @@ bool 
set_client_cert_file(ctx_t ctx, const char *cert_path, return false; } + // Verify that the certificate and private key match +#ifdef CPPHTTPLIB_MBEDTLS_V3 + ret = mbedtls_pk_check_pair(&mctx->own_cert.pk, &mctx->own_key, + mbedtls_ctr_drbg_random, &mctx->ctr_drbg); +#else + ret = mbedtls_pk_check_pair(&mctx->own_cert.pk, &mctx->own_key); +#endif + if (ret != 0) { + impl::mbedtls_last_error() = ret; + return false; + } + ret = mbedtls_ssl_conf_own_cert(&mctx->conf, &mctx->own_cert, &mctx->own_key); + if (ret != 0) { + impl::mbedtls_last_error() = ret; + return false; + } @@ -13877,4 +14688,1477 @@ std::string verify_error_string(long error_code) { #endif // CPPHTTPLIB_MBEDTLS_SUPPORT +/* + * Group 10: TLS abstraction layer - wolfSSL backend + */ + +/* + * wolfSSL Backend Implementation + */ + +#ifdef CPPHTTPLIB_WOLFSSL_SUPPORT +namespace tls { + +namespace impl { + +// wolfSSL session wrapper +struct WolfSSLSession { + WOLFSSL *ssl = nullptr; + socket_t sock = INVALID_SOCKET; + std::string hostname; // For client: set via set_sni + std::string sni_hostname; // For server: received from client via SNI callback + + WolfSSLSession() = default; + + ~WolfSSLSession() { + if (ssl) { wolfSSL_free(ssl); } + } + + WolfSSLSession(const WolfSSLSession &) = delete; + WolfSSLSession &operator=(const WolfSSLSession &) = delete; +}; + +// Thread-local error code accessor for wolfSSL +uint64_t &wolfssl_last_error() { + static thread_local uint64_t err = 0; + return err; +} + +// Helper to map wolfSSL error to ErrorCode. +// ssl_error is the value from wolfSSL_get_error(). +// out_errno receives errno when ssl_error is SSL_ERROR_SYSCALL.
+ErrorCode map_wolfssl_error(WOLFSSL *ssl, int ssl_error, + int &out_errno) { + switch (ssl_error) { + case SSL_ERROR_NONE: return ErrorCode::Success; + case SSL_ERROR_WANT_READ: return ErrorCode::WantRead; + case SSL_ERROR_WANT_WRITE: return ErrorCode::WantWrite; + case SSL_ERROR_ZERO_RETURN: return ErrorCode::PeerClosed; + case SSL_ERROR_SYSCALL: out_errno = errno; return ErrorCode::SyscallError; + default: + if (ssl) { + // wolfSSL stores the low-level error code as a negative value. + // DOMAIN_NAME_MISMATCH (-322) indicates hostname verification failure. + int low_err = ssl_error; // wolfSSL_get_error returns the low-level code + if (low_err == DOMAIN_NAME_MISMATCH) { + return ErrorCode::HostnameMismatch; + } + // Check verify result to distinguish cert verification from generic SSL + // errors. + long vr = wolfSSL_get_verify_result(ssl); + if (vr != 0) { return ErrorCode::CertVerifyFailed; } + } + return ErrorCode::Fatal; + } +} + +// WolfSSLContext constructor/destructor implementations +WolfSSLContext::WolfSSLContext() { wolfSSL_Init(); } + +WolfSSLContext::~WolfSSLContext() { + if (ctx) { wolfSSL_CTX_free(ctx); } +} + +// Thread-local storage for SNI captured during handshake +std::string &wolfssl_pending_sni() { + static thread_local std::string sni; + return sni; +} + +// SNI callback for wolfSSL server to capture client's SNI hostname +int wolfssl_sni_callback(WOLFSSL *ssl, int *ret, void *exArg) { + (void)ret; + (void)exArg; + + void *name_data = nullptr; + unsigned short name_len = + wolfSSL_SNI_GetRequest(ssl, WOLFSSL_SNI_HOST_NAME, &name_data); + + if (name_data && name_len > 0) { + wolfssl_pending_sni().assign(static_cast(name_data), + name_len); + } else { + wolfssl_pending_sni().clear(); + } + return 0; // Continue regardless +} + +// wolfSSL verify callback wrapper +int wolfssl_verify_callback(int preverify_ok, + WOLFSSL_X509_STORE_CTX *x509_ctx) { + auto &callback = get_verify_callback(); + if (!callback) { return preverify_ok; } + + 
WOLFSSL_X509 *cert = wolfSSL_X509_STORE_CTX_get_current_cert(x509_ctx); + int depth = wolfSSL_X509_STORE_CTX_get_error_depth(x509_ctx); + int err = wolfSSL_X509_STORE_CTX_get_error(x509_ctx); + + // Get the WOLFSSL object from the X509_STORE_CTX + WOLFSSL *ssl = static_cast(wolfSSL_X509_STORE_CTX_get_ex_data( + x509_ctx, wolfSSL_get_ex_data_X509_STORE_CTX_idx())); + + VerifyContext verify_ctx; + verify_ctx.session = static_cast(ssl); + verify_ctx.cert = static_cast(cert); + verify_ctx.depth = depth; + verify_ctx.preverify_ok = (preverify_ok != 0); + verify_ctx.error_code = static_cast(err); + + if (err != 0) { + verify_ctx.error_string = wolfSSL_X509_verify_cert_error_string(err); + } else { + verify_ctx.error_string = nullptr; + } + + bool accepted = callback(verify_ctx); + return accepted ? 1 : 0; +} + +void set_wolfssl_password_cb(WOLFSSL_CTX *ctx, const char *password) { + wolfSSL_CTX_set_default_passwd_cb_userdata(ctx, const_cast(password)); + wolfSSL_CTX_set_default_passwd_cb( + ctx, [](char *buf, int size, int /*rwflag*/, void *userdata) -> int { + auto *pwd = static_cast(userdata); + if (!pwd) return 0; + auto len = static_cast(strlen(pwd)); + if (len > size) len = size; + memcpy(buf, pwd, static_cast(len)); + return len; + }); +} + +} // namespace impl + +ctx_t create_client_context() { + auto ctx = new (std::nothrow) impl::WolfSSLContext(); + if (!ctx) { return nullptr; } + + ctx->is_server = false; + + WOLFSSL_METHOD *method = wolfTLSv1_2_client_method(); + if (!method) { + delete ctx; + return nullptr; + } + + ctx->ctx = wolfSSL_CTX_new(method); + if (!ctx->ctx) { + delete ctx; + return nullptr; + } + + // Default: verify peer certificate + wolfSSL_CTX_set_verify(ctx->ctx, SSL_VERIFY_PEER, nullptr); + + return static_cast(ctx); +} + +ctx_t create_server_context() { + auto ctx = new (std::nothrow) impl::WolfSSLContext(); + if (!ctx) { return nullptr; } + + ctx->is_server = true; + + WOLFSSL_METHOD *method = wolfTLSv1_2_server_method(); + if (!method) { + 
delete ctx; + return nullptr; + } + + ctx->ctx = wolfSSL_CTX_new(method); + if (!ctx->ctx) { + delete ctx; + return nullptr; + } + + // Default: don't verify client + wolfSSL_CTX_set_verify(ctx->ctx, SSL_VERIFY_NONE, nullptr); + + // Enable SNI on server + wolfSSL_CTX_SNI_SetOptions(ctx->ctx, WOLFSSL_SNI_HOST_NAME, + WOLFSSL_SNI_CONTINUE_ON_MISMATCH); + wolfSSL_CTX_set_servername_callback(ctx->ctx, impl::wolfssl_sni_callback); + + return static_cast(ctx); +} + +void free_context(ctx_t ctx) { + if (ctx) { delete static_cast(ctx); } +} + +bool set_min_version(ctx_t ctx, Version version) { + if (!ctx) { return false; } + auto wctx = static_cast(ctx); + + int min_ver = WOLFSSL_TLSV1_2; + if (version >= Version::TLS1_3) { min_ver = WOLFSSL_TLSV1_3; } + + return wolfSSL_CTX_SetMinVersion(wctx->ctx, min_ver) == WOLFSSL_SUCCESS; +} + +bool load_ca_pem(ctx_t ctx, const char *pem, size_t len) { + if (!ctx || !pem) { return false; } + auto wctx = static_cast(ctx); + + int ret = wolfSSL_CTX_load_verify_buffer( + wctx->ctx, reinterpret_cast(pem), + static_cast(len), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + wctx->ca_pem_data_.append(pem, len); + return true; +} + +bool load_ca_file(ctx_t ctx, const char *file_path) { + if (!ctx || !file_path) { return false; } + auto wctx = static_cast(ctx); + + int ret = wolfSSL_CTX_load_verify_locations(wctx->ctx, file_path, nullptr); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + return true; +} + +bool load_ca_dir(ctx_t ctx, const char *dir_path) { + if (!ctx || !dir_path) { return false; } + auto wctx = static_cast(ctx); + + int ret = wolfSSL_CTX_load_verify_locations(wctx->ctx, nullptr, dir_path); + // wolfSSL may fail if the directory doesn't contain properly hashed certs. 
+ // Unlike OpenSSL which lazily loads certs from directories, wolfSSL scans + // immediately. Return true even on failure since the CA file may have + // already been loaded, matching OpenSSL's lenient behavior. + (void)ret; + return true; +} + +bool load_system_certs(ctx_t ctx) { + if (!ctx) { return false; } + auto wctx = static_cast(ctx); + bool loaded = false; + +#ifdef _WIN32 + loaded = impl::enumerate_windows_system_certs( + [&](const unsigned char *data, size_t len) { + return wolfSSL_CTX_load_verify_buffer(wctx->ctx, data, + static_cast(len), + SSL_FILETYPE_ASN1) == SSL_SUCCESS; + }); +#elif defined(__APPLE__) && defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) + loaded = impl::enumerate_macos_keychain_certs( + [&](const unsigned char *data, size_t len) { + return wolfSSL_CTX_load_verify_buffer(wctx->ctx, data, + static_cast(len), + SSL_FILETYPE_ASN1) == SSL_SUCCESS; + }); +#else + for (auto path = impl::system_ca_paths(); *path; ++path) { + if (wolfSSL_CTX_load_verify_locations(wctx->ctx, *path, nullptr) == + SSL_SUCCESS) { + loaded = true; + break; + } + } + + if (!loaded) { + for (auto dir = impl::system_ca_dirs(); *dir; ++dir) { + if (wolfSSL_CTX_load_verify_locations(wctx->ctx, nullptr, *dir) == + SSL_SUCCESS) { + loaded = true; + break; + } + } + } +#endif + + return loaded; +} + +bool set_client_cert_pem(ctx_t ctx, const char *cert, const char *key, + const char *password) { + if (!ctx || !cert || !key) { return false; } + auto wctx = static_cast(ctx); + + // Load certificate + int ret = wolfSSL_CTX_use_certificate_buffer( + wctx->ctx, reinterpret_cast(cert), + static_cast(strlen(cert)), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Set password callback if password is provided + if (password) { impl::set_wolfssl_password_cb(wctx->ctx, password); } + + // Load private key + ret = wolfSSL_CTX_use_PrivateKey_buffer( + wctx->ctx, 
reinterpret_cast(key), + static_cast(strlen(key)), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Verify that the certificate and private key match + return wolfSSL_CTX_check_private_key(wctx->ctx) == SSL_SUCCESS; +} + +bool set_client_cert_file(ctx_t ctx, const char *cert_path, + const char *key_path, const char *password) { + if (!ctx || !cert_path || !key_path) { return false; } + auto wctx = static_cast(ctx); + + // Load certificate file + int ret = + wolfSSL_CTX_use_certificate_file(wctx->ctx, cert_path, SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Set password callback if password is provided + if (password) { impl::set_wolfssl_password_cb(wctx->ctx, password); } + + // Load private key file + ret = wolfSSL_CTX_use_PrivateKey_file(wctx->ctx, key_path, SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Verify that the certificate and private key match + return wolfSSL_CTX_check_private_key(wctx->ctx) == SSL_SUCCESS; +} + +void set_verify_client(ctx_t ctx, bool require) { + if (!ctx) { return; } + auto wctx = static_cast(ctx); + wctx->verify_client = require; + if (require) { + wolfSSL_CTX_set_verify( + wctx->ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, + wctx->has_verify_callback ? 
impl::wolfssl_verify_callback : nullptr); + } else { + if (wctx->has_verify_callback) { + wolfSSL_CTX_set_verify(wctx->ctx, SSL_VERIFY_PEER, + impl::wolfssl_verify_callback); + } else { + wolfSSL_CTX_set_verify(wctx->ctx, SSL_VERIFY_NONE, nullptr); + } + } +} + +session_t create_session(ctx_t ctx, socket_t sock) { + if (!ctx || sock == INVALID_SOCKET) { return nullptr; } + auto wctx = static_cast(ctx); + + auto session = new (std::nothrow) impl::WolfSSLSession(); + if (!session) { return nullptr; } + + session->sock = sock; + session->ssl = wolfSSL_new(wctx->ctx); + if (!session->ssl) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + delete session; + return nullptr; + } + + wolfSSL_set_fd(session->ssl, static_cast(sock)); + + return static_cast(session); +} + +void free_session(session_t session) { + if (session) { delete static_cast(session); } +} + +bool set_sni(session_t session, const char *hostname) { + if (!session || !hostname) { return false; } + auto wsession = static_cast(session); + + int ret = wolfSSL_UseSNI(wsession->ssl, WOLFSSL_SNI_HOST_NAME, hostname, + static_cast(strlen(hostname))); + if (ret != WOLFSSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Also set hostname for verification + wolfSSL_check_domain_name(wsession->ssl, hostname); + + wsession->hostname = hostname; + return true; +} + +bool set_hostname(session_t session, const char *hostname) { + // In wolfSSL, set_hostname also sets up hostname verification + return set_sni(session, hostname); +} + +TlsError connect(session_t session) { + TlsError err; + if (!session) { + err.code = ErrorCode::Fatal; + return err; + } + + auto wsession = static_cast(session); + int ret = wolfSSL_connect(wsession->ssl); + + if (ret == SSL_SUCCESS) { + err.code = ErrorCode::Success; + } else { + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + err.code = impl::map_wolfssl_error(wsession->ssl, 
ssl_error, err.sys_errno); + err.backend_code = static_cast(ssl_error); + impl::wolfssl_last_error() = err.backend_code; + } + + return err; +} + +TlsError accept(session_t session) { + TlsError err; + if (!session) { + err.code = ErrorCode::Fatal; + return err; + } + + auto wsession = static_cast(session); + int ret = wolfSSL_accept(wsession->ssl); + + if (ret == SSL_SUCCESS) { + err.code = ErrorCode::Success; + // Capture SNI from thread-local storage after successful handshake + wsession->sni_hostname = std::move(impl::wolfssl_pending_sni()); + impl::wolfssl_pending_sni().clear(); + } else { + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + err.code = impl::map_wolfssl_error(wsession->ssl, ssl_error, err.sys_errno); + err.backend_code = static_cast(ssl_error); + impl::wolfssl_last_error() = err.backend_code; + } + + return err; +} + +bool connect_nonblocking(session_t session, socket_t sock, + time_t timeout_sec, time_t timeout_usec, + TlsError *err) { + if (!session) { + if (err) { err->code = ErrorCode::Fatal; } + return false; + } + + auto wsession = static_cast(session); + + // Set socket to non-blocking mode + detail::set_nonblocking(sock, true); + auto cleanup = + detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); + + int ret; + while ((ret = wolfSSL_connect(wsession->ssl)) != SSL_SUCCESS) { + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + if (ssl_error == SSL_ERROR_WANT_READ) { + if (detail::select_read(sock, timeout_sec, timeout_usec) > 0) { + continue; + } + } else if (ssl_error == SSL_ERROR_WANT_WRITE) { + if (detail::select_write(sock, timeout_sec, timeout_usec) > 0) { + continue; + } + } + + // Error or timeout + if (err) { + err->code = + impl::map_wolfssl_error(wsession->ssl, ssl_error, err->sys_errno); + err->backend_code = static_cast(ssl_error); + } + impl::wolfssl_last_error() = static_cast(ssl_error); + return false; + } + + if (err) { err->code = ErrorCode::Success; } + return true; +} + +bool 
accept_nonblocking(session_t session, socket_t sock, + time_t timeout_sec, time_t timeout_usec, + TlsError *err) { + if (!session) { + if (err) { err->code = ErrorCode::Fatal; } + return false; + } + + auto wsession = static_cast(session); + + // Set socket to non-blocking mode + detail::set_nonblocking(sock, true); + auto cleanup = + detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); + + int ret; + while ((ret = wolfSSL_accept(wsession->ssl)) != SSL_SUCCESS) { + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + if (ssl_error == SSL_ERROR_WANT_READ) { + if (detail::select_read(sock, timeout_sec, timeout_usec) > 0) { + continue; + } + } else if (ssl_error == SSL_ERROR_WANT_WRITE) { + if (detail::select_write(sock, timeout_sec, timeout_usec) > 0) { + continue; + } + } + + // Error or timeout + if (err) { + err->code = + impl::map_wolfssl_error(wsession->ssl, ssl_error, err->sys_errno); + err->backend_code = static_cast(ssl_error); + } + impl::wolfssl_last_error() = static_cast(ssl_error); + return false; + } + + if (err) { err->code = ErrorCode::Success; } + + // Capture SNI from thread-local storage after successful handshake + wsession->sni_hostname = std::move(impl::wolfssl_pending_sni()); + impl::wolfssl_pending_sni().clear(); + + return true; +} + +ssize_t read(session_t session, void *buf, size_t len, TlsError &err) { + if (!session || !buf) { + err.code = ErrorCode::Fatal; + return -1; + } + + auto wsession = static_cast(session); + int ret = wolfSSL_read(wsession->ssl, buf, static_cast(len)); + + if (ret > 0) { + err.code = ErrorCode::Success; + return static_cast(ret); + } + + if (ret == 0) { + err.code = ErrorCode::PeerClosed; + return 0; + } + + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + err.code = impl::map_wolfssl_error(wsession->ssl, ssl_error, err.sys_errno); + err.backend_code = static_cast(ssl_error); + impl::wolfssl_last_error() = err.backend_code; + return -1; +} + +ssize_t write(session_t session, const void 
*buf, size_t len, + TlsError &err) { + if (!session || !buf) { + err.code = ErrorCode::Fatal; + return -1; + } + + auto wsession = static_cast(session); + int ret = wolfSSL_write(wsession->ssl, buf, static_cast(len)); + + if (ret > 0) { + err.code = ErrorCode::Success; + return static_cast(ret); + } + + // wolfSSL_write returns 0 when the peer has sent a close_notify. + // Treat this as an error (return -1) so callers don't spin in a + // write loop adding zero to the offset. + if (ret == 0) { + err.code = ErrorCode::PeerClosed; + return -1; + } + + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + err.code = impl::map_wolfssl_error(wsession->ssl, ssl_error, err.sys_errno); + err.backend_code = static_cast(ssl_error); + impl::wolfssl_last_error() = err.backend_code; + return -1; +} + +int pending(const_session_t session) { + if (!session) { return 0; } + auto wsession = + static_cast(const_cast(session)); + return wolfSSL_pending(wsession->ssl); +} + +void shutdown(session_t session, bool graceful) { + if (!session) { return; } + auto wsession = static_cast(session); + + if (graceful) { + int ret; + int attempts = 0; + while ((ret = wolfSSL_shutdown(wsession->ssl)) != SSL_SUCCESS && + attempts < 3) { + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + if (ssl_error != SSL_ERROR_WANT_READ && + ssl_error != SSL_ERROR_WANT_WRITE) { + break; + } + attempts++; + } + } else { + wolfSSL_shutdown(wsession->ssl); + } +} + +bool is_peer_closed(session_t session, socket_t sock) { + if (!session || sock == INVALID_SOCKET) { return true; } + auto wsession = static_cast(session); + + // Check if there's already decrypted data available + if (wolfSSL_pending(wsession->ssl) > 0) { return false; } + + // Set socket to non-blocking to avoid blocking on read + detail::set_nonblocking(sock, true); + auto cleanup = + detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); + + // Peek 1 byte to check connection status without consuming data + unsigned char 
buf; + int ret = wolfSSL_peek(wsession->ssl, &buf, 1); + + // If we got data or WANT_READ (would block), connection is alive + if (ret > 0) { return false; } + + int ssl_error = wolfSSL_get_error(wsession->ssl, ret); + if (ssl_error == SSL_ERROR_WANT_READ) { return false; } + + return ssl_error == SSL_ERROR_ZERO_RETURN || ssl_error == SSL_ERROR_SYSCALL || + ret == 0; +} + +cert_t get_peer_cert(const_session_t session) { + if (!session) { return nullptr; } + auto wsession = + static_cast(const_cast(session)); + + WOLFSSL_X509 *cert = wolfSSL_get_peer_certificate(wsession->ssl); + return static_cast(cert); +} + +void free_cert(cert_t cert) { + if (cert) { wolfSSL_X509_free(static_cast(cert)); } +} + +bool verify_hostname(cert_t cert, const char *hostname) { + if (!cert || !hostname) { return false; } + auto x509 = static_cast(cert); + std::string host_str(hostname); + + // Check if hostname is an IP address + bool is_ip = impl::is_ipv4_address(host_str); + unsigned char ip_bytes[4]; + if (is_ip) { impl::parse_ipv4(host_str, ip_bytes); } + + // Check Subject Alternative Names + auto *san_names = static_cast( + wolfSSL_X509_get_ext_d2i(x509, NID_subject_alt_name, nullptr, nullptr)); + + if (san_names) { + int san_count = wolfSSL_sk_num(san_names); + for (int i = 0; i < san_count; i++) { + auto *names = + static_cast(wolfSSL_sk_value(san_names, i)); + if (!names) continue; + + if (!is_ip && names->type == WOLFSSL_GEN_DNS) { + // DNS name + unsigned char *dns_name = nullptr; + int dns_len = wolfSSL_ASN1_STRING_to_UTF8(&dns_name, names->d.dNSName); + if (dns_name && dns_len > 0) { + std::string san_name(reinterpret_cast(dns_name), + static_cast(dns_len)); + XFREE(dns_name, nullptr, DYNAMIC_TYPE_OPENSSL); + if (detail::match_hostname(san_name, host_str)) { + wolfSSL_sk_free(san_names); + return true; + } + } + } else if (is_ip && names->type == WOLFSSL_GEN_IPADD) { + // IP address + unsigned char *ip_data = wolfSSL_ASN1_STRING_data(names->d.iPAddress); + int ip_len = 
wolfSSL_ASN1_STRING_length(names->d.iPAddress); + if (ip_data && ip_len == 4 && memcmp(ip_data, ip_bytes, 4) == 0) { + wolfSSL_sk_free(san_names); + return true; + } + } + } + wolfSSL_sk_free(san_names); + } + + // Fallback: Check Common Name (CN) in subject + WOLFSSL_X509_NAME *subject = wolfSSL_X509_get_subject_name(x509); + if (subject) { + char cn[256] = {}; + int cn_len = wolfSSL_X509_NAME_get_text_by_NID(subject, NID_commonName, cn, + sizeof(cn)); + if (cn_len > 0) { + std::string cn_str(cn, static_cast(cn_len)); + if (detail::match_hostname(cn_str, host_str)) { return true; } + } + } + + return false; +} + +uint64_t hostname_mismatch_code() { + return static_cast(DOMAIN_NAME_MISMATCH); +} + +long get_verify_result(const_session_t session) { + if (!session) { return -1; } + auto wsession = + static_cast(const_cast(session)); + long result = wolfSSL_get_verify_result(wsession->ssl); + return result; +} + +std::string get_cert_subject_cn(cert_t cert) { + if (!cert) return ""; + auto x509 = static_cast(cert); + + WOLFSSL_X509_NAME *subject = wolfSSL_X509_get_subject_name(x509); + if (!subject) return ""; + + char cn[256] = {}; + int cn_len = wolfSSL_X509_NAME_get_text_by_NID(subject, NID_commonName, cn, + sizeof(cn)); + if (cn_len <= 0) return ""; + return std::string(cn, static_cast(cn_len)); +} + +std::string get_cert_issuer_name(cert_t cert) { + if (!cert) return ""; + auto x509 = static_cast(cert); + + WOLFSSL_X509_NAME *issuer = wolfSSL_X509_get_issuer_name(x509); + if (!issuer) return ""; + + char *name_str = wolfSSL_X509_NAME_oneline(issuer, nullptr, 0); + if (!name_str) return ""; + + std::string result(name_str); + XFREE(name_str, nullptr, DYNAMIC_TYPE_OPENSSL); + return result; +} + +bool get_cert_sans(cert_t cert, std::vector &sans) { + sans.clear(); + if (!cert) return false; + auto x509 = static_cast(cert); + + auto *san_names = static_cast( + wolfSSL_X509_get_ext_d2i(x509, NID_subject_alt_name, nullptr, nullptr)); + if (!san_names) return true; // 
No SANs is not an error + + int count = wolfSSL_sk_num(san_names); + for (int i = 0; i < count; i++) { + auto *name = + static_cast(wolfSSL_sk_value(san_names, i)); + if (!name) continue; + + SanEntry entry; + switch (name->type) { + case WOLFSSL_GEN_DNS: { + entry.type = SanType::DNS; + unsigned char *dns_name = nullptr; + int dns_len = wolfSSL_ASN1_STRING_to_UTF8(&dns_name, name->d.dNSName); + if (dns_name && dns_len > 0) { + entry.value = std::string(reinterpret_cast(dns_name), + static_cast(dns_len)); + XFREE(dns_name, nullptr, DYNAMIC_TYPE_OPENSSL); + } + break; + } + case WOLFSSL_GEN_IPADD: { + entry.type = SanType::IP; + unsigned char *ip_data = wolfSSL_ASN1_STRING_data(name->d.iPAddress); + int ip_len = wolfSSL_ASN1_STRING_length(name->d.iPAddress); + if (ip_data && ip_len == 4) { + char buf[16]; + snprintf(buf, sizeof(buf), "%d.%d.%d.%d", ip_data[0], ip_data[1], + ip_data[2], ip_data[3]); + entry.value = buf; + } else if (ip_data && ip_len == 16) { + char buf[64]; + snprintf(buf, sizeof(buf), + "%02x%02x:%02x%02x:%02x%02x:%02x%02x:" + "%02x%02x:%02x%02x:%02x%02x:%02x%02x", + ip_data[0], ip_data[1], ip_data[2], ip_data[3], ip_data[4], + ip_data[5], ip_data[6], ip_data[7], ip_data[8], ip_data[9], + ip_data[10], ip_data[11], ip_data[12], ip_data[13], + ip_data[14], ip_data[15]); + entry.value = buf; + } + break; + } + case WOLFSSL_GEN_EMAIL: + entry.type = SanType::EMAIL; + { + unsigned char *email = nullptr; + int email_len = wolfSSL_ASN1_STRING_to_UTF8(&email, name->d.rfc822Name); + if (email && email_len > 0) { + entry.value = std::string(reinterpret_cast(email), + static_cast(email_len)); + XFREE(email, nullptr, DYNAMIC_TYPE_OPENSSL); + } + } + break; + case WOLFSSL_GEN_URI: + entry.type = SanType::URI; + { + unsigned char *uri = nullptr; + int uri_len = wolfSSL_ASN1_STRING_to_UTF8( + &uri, name->d.uniformResourceIdentifier); + if (uri && uri_len > 0) { + entry.value = std::string(reinterpret_cast(uri), + static_cast(uri_len)); + XFREE(uri, nullptr, 
DYNAMIC_TYPE_OPENSSL); + } + } + break; + default: entry.type = SanType::OTHER; break; + } + + if (!entry.value.empty()) { sans.push_back(std::move(entry)); } + } + wolfSSL_sk_free(san_names); + return true; +} + +bool get_cert_validity(cert_t cert, time_t ¬_before, + time_t ¬_after) { + if (!cert) return false; + auto x509 = static_cast(cert); + + const WOLFSSL_ASN1_TIME *nb = wolfSSL_X509_get_notBefore(x509); + const WOLFSSL_ASN1_TIME *na = wolfSSL_X509_get_notAfter(x509); + + if (!nb || !na) return false; + + // wolfSSL_ASN1_TIME_to_tm is available + struct tm tm_nb = {}, tm_na = {}; + if (wolfSSL_ASN1_TIME_to_tm(nb, &tm_nb) != WOLFSSL_SUCCESS) return false; + if (wolfSSL_ASN1_TIME_to_tm(na, &tm_na) != WOLFSSL_SUCCESS) return false; + +#ifdef _WIN32 + not_before = _mkgmtime(&tm_nb); + not_after = _mkgmtime(&tm_na); +#else + not_before = timegm(&tm_nb); + not_after = timegm(&tm_na); +#endif + return true; +} + +std::string get_cert_serial(cert_t cert) { + if (!cert) return ""; + auto x509 = static_cast(cert); + + WOLFSSL_ASN1_INTEGER *serial_asn1 = wolfSSL_X509_get_serialNumber(x509); + if (!serial_asn1) return ""; + + // Get the serial number data + int len = serial_asn1->length; + unsigned char *data = serial_asn1->data; + if (!data || len <= 0) return ""; + + std::string result; + result.reserve(static_cast(len) * 2); + for (int i = 0; i < len; i++) { + char hex[3]; + snprintf(hex, sizeof(hex), "%02X", data[i]); + result += hex; + } + return result; +} + +bool get_cert_der(cert_t cert, std::vector &der) { + if (!cert) return false; + auto x509 = static_cast(cert); + + int der_len = 0; + const unsigned char *der_data = wolfSSL_X509_get_der(x509, &der_len); + if (!der_data || der_len <= 0) return false; + + der.assign(der_data, der_data + der_len); + return true; +} + +const char *get_sni(const_session_t session) { + if (!session) return nullptr; + auto wsession = static_cast(session); + + // For server: return SNI received from client during handshake + if 
(!wsession->sni_hostname.empty()) { + return wsession->sni_hostname.c_str(); + } + + // For client: return the hostname set via set_sni + if (!wsession->hostname.empty()) { return wsession->hostname.c_str(); } + + return nullptr; +} + +uint64_t peek_error() { + return static_cast(wolfSSL_ERR_peek_last_error()); +} + +uint64_t get_error() { + uint64_t err = impl::wolfssl_last_error(); + impl::wolfssl_last_error() = 0; + return err; +} + +std::string error_string(uint64_t code) { + char buf[256]; + wolfSSL_ERR_error_string(static_cast(code), buf); + return std::string(buf); +} + +ca_store_t create_ca_store(const char *pem, size_t len) { + if (!pem || len == 0) { return nullptr; } + // Validate by attempting to load into a temporary ctx + WOLFSSL_CTX *tmp_ctx = wolfSSL_CTX_new(wolfTLSv1_2_client_method()); + if (!tmp_ctx) { return nullptr; } + int ret = wolfSSL_CTX_load_verify_buffer( + tmp_ctx, reinterpret_cast(pem), + static_cast(len), SSL_FILETYPE_PEM); + wolfSSL_CTX_free(tmp_ctx); + if (ret != SSL_SUCCESS) { return nullptr; } + return static_cast( + new impl::WolfSSLCAStore{std::string(pem, len)}); +} + +void free_ca_store(ca_store_t store) { + delete static_cast(store); +} + +bool set_ca_store(ctx_t ctx, ca_store_t store) { + if (!ctx || !store) { return false; } + auto *wctx = static_cast(ctx); + auto *ca = static_cast(store); + int ret = wolfSSL_CTX_load_verify_buffer( + wctx->ctx, reinterpret_cast(ca->pem_data.data()), + static_cast(ca->pem_data.size()), SSL_FILETYPE_PEM); + if (ret == SSL_SUCCESS) { wctx->ca_pem_data_ += ca->pem_data; } + return ret == SSL_SUCCESS; +} + +size_t get_ca_certs(ctx_t ctx, std::vector &certs) { + certs.clear(); + if (!ctx) { return 0; } + auto *wctx = static_cast(ctx); + if (wctx->ca_pem_data_.empty()) { return 0; } + + const std::string &pem = wctx->ca_pem_data_; + const std::string begin_marker = "-----BEGIN CERTIFICATE-----"; + const std::string end_marker = "-----END CERTIFICATE-----"; + size_t pos = 0; + while ((pos = 
pem.find(begin_marker, pos)) != std::string::npos) { + size_t end_pos = pem.find(end_marker, pos); + if (end_pos == std::string::npos) { break; } + end_pos += end_marker.size(); + std::string cert_pem = pem.substr(pos, end_pos - pos); + WOLFSSL_X509 *x509 = wolfSSL_X509_load_certificate_buffer( + reinterpret_cast(cert_pem.data()), + static_cast(cert_pem.size()), WOLFSSL_FILETYPE_PEM); + if (x509) { certs.push_back(static_cast(x509)); } + pos = end_pos; + } + return certs.size(); +} + +std::vector get_ca_names(ctx_t ctx) { + std::vector names; + if (!ctx) { return names; } + auto *wctx = static_cast(ctx); + if (wctx->ca_pem_data_.empty()) { return names; } + + const std::string &pem = wctx->ca_pem_data_; + const std::string begin_marker = "-----BEGIN CERTIFICATE-----"; + const std::string end_marker = "-----END CERTIFICATE-----"; + size_t pos = 0; + while ((pos = pem.find(begin_marker, pos)) != std::string::npos) { + size_t end_pos = pem.find(end_marker, pos); + if (end_pos == std::string::npos) { break; } + end_pos += end_marker.size(); + std::string cert_pem = pem.substr(pos, end_pos - pos); + WOLFSSL_X509 *x509 = wolfSSL_X509_load_certificate_buffer( + reinterpret_cast(cert_pem.data()), + static_cast(cert_pem.size()), WOLFSSL_FILETYPE_PEM); + if (x509) { + WOLFSSL_X509_NAME *subject = wolfSSL_X509_get_subject_name(x509); + if (subject) { + char *name_str = wolfSSL_X509_NAME_oneline(subject, nullptr, 0); + if (name_str) { + names.push_back(name_str); + XFREE(name_str, nullptr, DYNAMIC_TYPE_OPENSSL); + } + } + wolfSSL_X509_free(x509); + } + pos = end_pos; + } + return names; +} + +bool update_server_cert(ctx_t ctx, const char *cert_pem, + const char *key_pem, const char *password) { + if (!ctx || !cert_pem || !key_pem) { return false; } + auto *wctx = static_cast(ctx); + + // Load new certificate + int ret = wolfSSL_CTX_use_certificate_buffer( + wctx->ctx, reinterpret_cast(cert_pem), + static_cast(strlen(cert_pem)), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + 
impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + // Set password if provided + if (password) { impl::set_wolfssl_password_cb(wctx->ctx, password); } + + // Load new private key + ret = wolfSSL_CTX_use_PrivateKey_buffer( + wctx->ctx, reinterpret_cast(key_pem), + static_cast(strlen(key_pem)), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + + return true; +} + +bool update_server_client_ca(ctx_t ctx, const char *ca_pem) { + if (!ctx || !ca_pem) { return false; } + auto *wctx = static_cast(ctx); + + int ret = wolfSSL_CTX_load_verify_buffer( + wctx->ctx, reinterpret_cast(ca_pem), + static_cast(strlen(ca_pem)), SSL_FILETYPE_PEM); + if (ret != SSL_SUCCESS) { + impl::wolfssl_last_error() = + static_cast(wolfSSL_ERR_peek_last_error()); + return false; + } + return true; +} + +bool set_verify_callback(ctx_t ctx, VerifyCallback callback) { + if (!ctx) { return false; } + auto *wctx = static_cast(ctx); + + impl::get_verify_callback() = std::move(callback); + wctx->has_verify_callback = static_cast(impl::get_verify_callback()); + + if (wctx->has_verify_callback) { + wolfSSL_CTX_set_verify(wctx->ctx, SSL_VERIFY_PEER, + impl::wolfssl_verify_callback); + } else { + wolfSSL_CTX_set_verify( + wctx->ctx, + wctx->verify_client + ? (SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT) + : SSL_VERIFY_NONE, + nullptr); + } + return true; +} + +long get_verify_error(const_session_t session) { + if (!session) { return -1; } + auto *wsession = + static_cast(const_cast(session)); + return wolfSSL_get_verify_result(wsession->ssl); +} + +std::string verify_error_string(long error_code) { + if (error_code == 0) { return ""; } + const char *str = + wolfSSL_X509_verify_cert_error_string(static_cast(error_code)); + return str ? 
std::string(str) : std::string(); +} + +} // namespace tls + +#endif // CPPHTTPLIB_WOLFSSL_SUPPORT + +// WebSocket implementation +namespace ws { + +bool WebSocket::send_frame(Opcode op, const char *data, size_t len, + bool fin) { + std::lock_guard lock(write_mutex_); + if (closed_) { return false; } + return detail::write_websocket_frame(strm_, op, data, len, fin, !is_server_); +} + +ReadResult WebSocket::read(std::string &msg) { + while (!closed_) { + Opcode opcode; + std::string payload; + bool fin; + + if (!impl::read_websocket_frame(strm_, opcode, payload, fin, is_server_, + CPPHTTPLIB_WEBSOCKET_MAX_PAYLOAD_LENGTH)) { + closed_ = true; + return Fail; + } + + switch (opcode) { + case Opcode::Ping: { + std::lock_guard lock(write_mutex_); + detail::write_websocket_frame(strm_, Opcode::Pong, payload.data(), + payload.size(), true, !is_server_); + continue; + } + case Opcode::Pong: continue; + case Opcode::Close: { + if (!closed_.exchange(true)) { + // Echo close frame back + std::lock_guard lock(write_mutex_); + detail::write_websocket_frame(strm_, Opcode::Close, payload.data(), + payload.size(), true, !is_server_); + } + return Fail; + } + case Opcode::Text: + case Opcode::Binary: { + auto result = opcode == Opcode::Text ? 
Text : Binary; + msg = std::move(payload); + + // Handle fragmentation + if (!fin) { + while (true) { + Opcode cont_opcode; + std::string cont_payload; + bool cont_fin; + if (!impl::read_websocket_frame( + strm_, cont_opcode, cont_payload, cont_fin, is_server_, + CPPHTTPLIB_WEBSOCKET_MAX_PAYLOAD_LENGTH)) { + closed_ = true; + return Fail; + } + if (cont_opcode == Opcode::Ping) { + std::lock_guard lock(write_mutex_); + detail::write_websocket_frame( + strm_, Opcode::Pong, cont_payload.data(), cont_payload.size(), + true, !is_server_); + continue; + } + if (cont_opcode == Opcode::Pong) { continue; } + if (cont_opcode == Opcode::Close) { + if (!closed_.exchange(true)) { + std::lock_guard lock(write_mutex_); + detail::write_websocket_frame( + strm_, Opcode::Close, cont_payload.data(), + cont_payload.size(), true, !is_server_); + } + return Fail; + } + // RFC 6455: continuation frames must use opcode 0x0 + if (cont_opcode != Opcode::Continuation) { + closed_ = true; + return Fail; + } + msg += cont_payload; + if (msg.size() > CPPHTTPLIB_WEBSOCKET_MAX_PAYLOAD_LENGTH) { + closed_ = true; + return Fail; + } + if (cont_fin) { break; } + } + } + // RFC 6455 Section 5.6: text frames must contain valid UTF-8 + if (result == Text && !impl::is_valid_utf8(msg)) { + close(CloseStatus::InvalidPayload, "invalid UTF-8"); + return Fail; + } + return result; + } + default: closed_ = true; return Fail; + } + } + return Fail; +} + +bool WebSocket::send(const std::string &data) { + return send_frame(Opcode::Text, data.data(), data.size()); +} + +bool WebSocket::send(const char *data, size_t len) { + return send_frame(Opcode::Binary, data, len); +} + +void WebSocket::close(CloseStatus status, const std::string &reason) { + if (closed_.exchange(true)) { return; } + ping_cv_.notify_all(); + std::string payload; + auto code = static_cast(status); + payload.push_back(static_cast((code >> 8) & 0xFF)); + payload.push_back(static_cast(code & 0xFF)); + // RFC 6455 Section 5.5: control frame 
payload must not exceed 125 bytes + // Close frame has 2-byte status code, so reason is limited to 123 bytes + payload += reason.substr(0, 123); + { + std::lock_guard lock(write_mutex_); + detail::write_websocket_frame(strm_, Opcode::Close, payload.data(), + payload.size(), true, !is_server_); + } + + // RFC 6455 Section 7.1.1: after sending a Close frame, wait for the peer's + // Close response before closing the TCP connection. Use a short timeout to + // avoid hanging if the peer doesn't respond. + strm_.set_read_timeout(CPPHTTPLIB_WEBSOCKET_CLOSE_TIMEOUT_SECOND, 0); + Opcode op; + std::string resp; + bool fin; + while (impl::read_websocket_frame(strm_, op, resp, fin, is_server_, 125)) { + if (op == Opcode::Close) { break; } + } +} + +WebSocket::~WebSocket() { + { + std::lock_guard lock(ping_mutex_); + closed_ = true; + } + ping_cv_.notify_all(); + if (ping_thread_.joinable()) { ping_thread_.join(); } +} + +void WebSocket::start_heartbeat() { + ping_thread_ = std::thread([this]() { + std::unique_lock lock(ping_mutex_); + while (!closed_) { + ping_cv_.wait_for(lock, std::chrono::seconds( + CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND)); + if (closed_) { break; } + lock.unlock(); + if (!send_frame(Opcode::Ping, nullptr, 0)) { + closed_ = true; + break; + } + lock.lock(); + } + }); +} + +const Request &WebSocket::request() const { return req_; } + +bool WebSocket::is_open() const { return !closed_; } + +// WebSocketClient implementation +WebSocketClient::WebSocketClient( + const std::string &scheme_host_port_path, const Headers &headers) + : headers_(headers) { + const static std::regex re( + R"(([a-z]+):\/\/(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?(\/.*))"); + + std::smatch m; + if (std::regex_match(scheme_host_port_path, m, re)) { + auto scheme = m[1].str(); + +#ifdef CPPHTTPLIB_SSL_ENABLED + if (scheme != "ws" && scheme != "wss") { +#else + if (scheme != "ws") { +#endif +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + std::string msg = "'" + scheme + "' scheme is not 
supported."; + throw std::invalid_argument(msg); +#endif + return; + } + + auto is_ssl = scheme == "wss"; + + host_ = m[2].str(); + if (host_.empty()) { host_ = m[3].str(); } + + auto port_str = m[4].str(); + port_ = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80); + + path_ = m[5].str(); + +#ifdef CPPHTTPLIB_SSL_ENABLED + is_ssl_ = is_ssl; +#else + if (is_ssl) { return; } +#endif + + is_valid_ = true; + } +} + +WebSocketClient::~WebSocketClient() { shutdown_and_close(); } + +bool WebSocketClient::is_valid() const { return is_valid_; } + +void WebSocketClient::shutdown_and_close() { +#ifdef CPPHTTPLIB_SSL_ENABLED + if (is_ssl_) { + if (tls_session_) { + tls::shutdown(tls_session_, true); + tls::free_session(tls_session_); + tls_session_ = nullptr; + } + if (tls_ctx_) { + tls::free_context(tls_ctx_); + tls_ctx_ = nullptr; + } + } +#endif + if (ws_ && ws_->is_open()) { ws_->close(); } + ws_.reset(); + if (sock_ != INVALID_SOCKET) { + detail::shutdown_socket(sock_); + detail::close_socket(sock_); + sock_ = INVALID_SOCKET; + } +} + +bool WebSocketClient::create_stream(std::unique_ptr &strm) { +#ifdef CPPHTTPLIB_SSL_ENABLED + if (is_ssl_) { + if (!detail::setup_client_tls_session( + host_, tls_ctx_, tls_session_, sock_, + server_certificate_verification_, ca_cert_file_path_, + ca_cert_store_, read_timeout_sec_, read_timeout_usec_)) { + return false; + } + + strm = std::unique_ptr(new detail::SSLSocketStream( + sock_, tls_session_, read_timeout_sec_, read_timeout_usec_, + write_timeout_sec_, write_timeout_usec_)); + return true; + } +#endif + strm = std::unique_ptr( + new detail::SocketStream(sock_, read_timeout_sec_, read_timeout_usec_, + write_timeout_sec_, write_timeout_usec_)); + return true; +} + +bool WebSocketClient::connect() { + if (!is_valid_) { return false; } + shutdown_and_close(); + + Error error; + sock_ = detail::create_client_socket( + host_, std::string(), port_, AF_UNSPEC, false, false, nullptr, 5, 0, + read_timeout_sec_, 
read_timeout_usec_, write_timeout_sec_, + write_timeout_usec_, std::string(), error); + + if (sock_ == INVALID_SOCKET) { return false; } + + std::unique_ptr strm; + if (!create_stream(strm)) { + shutdown_and_close(); + return false; + } + + std::string selected_subprotocol; + if (!detail::perform_websocket_handshake(*strm, host_, port_, path_, headers_, + selected_subprotocol)) { + shutdown_and_close(); + return false; + } + subprotocol_ = std::move(selected_subprotocol); + + Request req; + req.method = "GET"; + req.path = path_; + ws_ = std::unique_ptr(new WebSocket(std::move(strm), req, false)); + return true; +} + +ReadResult WebSocketClient::read(std::string &msg) { + if (!ws_) { return Fail; } + return ws_->read(msg); +} + +bool WebSocketClient::send(const std::string &data) { + if (!ws_) { return false; } + return ws_->send(data); +} + +bool WebSocketClient::send(const char *data, size_t len) { + if (!ws_) { return false; } + return ws_->send(data, len); +} + +void WebSocketClient::close(CloseStatus status, + const std::string &reason) { + if (ws_) { ws_->close(status, reason); } +} + +bool WebSocketClient::is_open() const { return ws_ && ws_->is_open(); } + +const std::string &WebSocketClient::subprotocol() const { + return subprotocol_; +} + +void WebSocketClient::set_read_timeout(time_t sec, time_t usec) { + read_timeout_sec_ = sec; + read_timeout_usec_ = usec; +} + +void WebSocketClient::set_write_timeout(time_t sec, time_t usec) { + write_timeout_sec_ = sec; + write_timeout_usec_ = usec; +} + +#ifdef CPPHTTPLIB_SSL_ENABLED + +void WebSocketClient::set_ca_cert_path(const std::string &path) { + ca_cert_file_path_ = path; +} + +void WebSocketClient::set_ca_cert_store(tls::ca_store_t store) { + ca_cert_store_ = store; +} + +void +WebSocketClient::enable_server_certificate_verification(bool enabled) { + server_certificate_verification_ = enabled; +} + +#endif // CPPHTTPLIB_SSL_ENABLED + +} // namespace ws + } // namespace httplib diff --git 
a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index f7563283ee..a39876891e 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.32.0" -#define CPPHTTPLIB_VERSION_NUM "0x002000" +#define CPPHTTPLIB_VERSION "0.34.0" +#define CPPHTTPLIB_VERSION_NUM "0x002200" /* * Platform compatibility check @@ -185,6 +185,14 @@ : 0)) #endif +#ifndef CPPHTTPLIB_THREAD_POOL_MAX_COUNT +#define CPPHTTPLIB_THREAD_POOL_MAX_COUNT (CPPHTTPLIB_THREAD_POOL_COUNT * 4) +#endif + +#ifndef CPPHTTPLIB_THREAD_POOL_IDLE_TIMEOUT +#define CPPHTTPLIB_THREAD_POOL_IDLE_TIMEOUT 3 // seconds +#endif + #ifndef CPPHTTPLIB_RECV_FLAGS #define CPPHTTPLIB_RECV_FLAGS 0 #endif @@ -201,6 +209,22 @@ #define CPPHTTPLIB_MAX_LINE_LENGTH 32768 #endif +#ifndef CPPHTTPLIB_WEBSOCKET_MAX_PAYLOAD_LENGTH +#define CPPHTTPLIB_WEBSOCKET_MAX_PAYLOAD_LENGTH 16777216 +#endif + +#ifndef CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND +#define CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND 300 +#endif + +#ifndef CPPHTTPLIB_WEBSOCKET_CLOSE_TIMEOUT_SECOND +#define CPPHTTPLIB_WEBSOCKET_CLOSE_TIMEOUT_SECOND 5 +#endif + +#ifndef CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND +#define CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND 30 +#endif + /* * Headers */ @@ -310,6 +334,7 @@ using socket_t = int; #include #include #include +#include #include #include #include @@ -328,6 +353,9 @@ using socket_t = int; #include #include #include +#if __cplusplus >= 201703L +#include +#endif #if defined(CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO) || \ defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) @@ -415,10 +443,46 @@ using socket_t = int; #endif // CPPHTTPLIB_MBEDTLS_SUPPORT +#ifdef CPPHTTPLIB_WOLFSSL_SUPPORT +#include + +#include + +// Fallback definitions for older wolfSSL versions (e.g., 5.6.6) +#ifndef WOLFSSL_GEN_EMAIL +#define WOLFSSL_GEN_EMAIL 1 +#endif +#ifndef WOLFSSL_GEN_DNS +#define WOLFSSL_GEN_DNS 2 +#endif +#ifndef 
WOLFSSL_GEN_URI +#define WOLFSSL_GEN_URI 6 +#endif +#ifndef WOLFSSL_GEN_IPADD +#define WOLFSSL_GEN_IPADD 7 +#endif + +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#ifdef _MSC_VER +#pragma comment(lib, "crypt32.lib") +#endif +#endif // _WIN32 +#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) +#if TARGET_OS_MAC +#include +#endif +#endif // CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN +#endif // CPPHTTPLIB_WOLFSSL_SUPPORT + // Define CPPHTTPLIB_SSL_ENABLED if any SSL backend is available -// This simplifies conditional compilation when adding new backends (e.g., -// wolfSSL) -#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) || defined(CPPHTTPLIB_MBEDTLS_SUPPORT) +#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) || \ + defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || defined(CPPHTTPLIB_WOLFSSL_SUPPORT) #define CPPHTTPLIB_SSL_ENABLED #endif @@ -440,6 +504,10 @@ using socket_t = int; */ namespace httplib { +namespace ws { +class WebSocket; +} // namespace ws + namespace detail { /* @@ -711,6 +779,143 @@ using Match = std::smatch; using DownloadProgress = std::function; using UploadProgress = std::function; + +#if __cplusplus >= 201703L + +using any = std::any; +using bad_any_cast = std::bad_any_cast; + +template T any_cast(const any &a) { return std::any_cast(a); } +template T any_cast(any &a) { return std::any_cast(a); } +template T any_cast(any &&a) { + return std::any_cast(std::move(a)); +} +template const T *any_cast(const any *a) noexcept { + return std::any_cast(a); +} +template T *any_cast(any *a) noexcept { + return std::any_cast(a); +} + +#else // C++11/14 implementation + +class bad_any_cast : public std::bad_cast { +public: + const char *what() const noexcept override { return "bad any_cast"; } +}; + +namespace detail { + +using any_type_id = const void *; + +// Returns a unique per-type ID without RTTI. +// The static address is stable across TUs because function templates are +// implicitly inline and the ODR merges their statics into one. 
+template any_type_id any_typeid() noexcept { + static const char id = 0; + return &id; +} + +struct any_storage { + virtual ~any_storage() = default; + virtual std::unique_ptr clone() const = 0; + virtual any_type_id type_id() const noexcept = 0; +}; + +template struct any_value final : any_storage { + T value; + template explicit any_value(U &&v) : value(std::forward(v)) {} + std::unique_ptr clone() const override { + return std::unique_ptr(new any_value(value)); + } + any_type_id type_id() const noexcept override { return any_typeid(); } +}; + +} // namespace detail + +class any { + std::unique_ptr storage_; + +public: + any() noexcept = default; + any(const any &o) : storage_(o.storage_ ? o.storage_->clone() : nullptr) {} + any(any &&) noexcept = default; + any &operator=(const any &o) { + storage_ = o.storage_ ? o.storage_->clone() : nullptr; + return *this; + } + any &operator=(any &&) noexcept = default; + + template < + typename T, typename D = typename std::decay::type, + typename std::enable_if::value, int>::type = 0> + any(T &&v) : storage_(new detail::any_value(std::forward(v))) {} + + template < + typename T, typename D = typename std::decay::type, + typename std::enable_if::value, int>::type = 0> + any &operator=(T &&v) { + storage_.reset(new detail::any_value(std::forward(v))); + return *this; + } + + bool has_value() const noexcept { return storage_ != nullptr; } + void reset() noexcept { storage_.reset(); } + + template friend T *any_cast(any *a) noexcept; + template friend const T *any_cast(const any *a) noexcept; +}; + +template T *any_cast(any *a) noexcept { + if (!a || !a->storage_) { return nullptr; } + if (a->storage_->type_id() != detail::any_typeid()) { return nullptr; } + return &static_cast *>(a->storage_.get())->value; +} + +template const T *any_cast(const any *a) noexcept { + if (!a || !a->storage_) { return nullptr; } + if (a->storage_->type_id() != detail::any_typeid()) { return nullptr; } + return &static_cast 
*>(a->storage_.get())->value; +} + +template T any_cast(const any &a) { + using U = + typename std::remove_cv::type>::type; + const U *p = any_cast(&a); +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + if (!p) { throw bad_any_cast{}; } +#else + if (!p) { std::abort(); } +#endif + return static_cast(*p); +} + +template T any_cast(any &a) { + using U = + typename std::remove_cv::type>::type; + U *p = any_cast(&a); +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + if (!p) { throw bad_any_cast{}; } +#else + if (!p) { std::abort(); } +#endif + return static_cast(*p); +} + +template T any_cast(any &&a) { + using U = + typename std::remove_cv::type>::type; + U *p = any_cast(&a); +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + if (!p) { throw bad_any_cast{}; } +#else + if (!p) { std::abort(); } +#endif + return static_cast(std::move(*p)); +} + +#endif // __cplusplus >= 201703L + struct Response; using ResponseHandler = std::function; @@ -805,6 +1010,60 @@ struct FormDataProvider { }; using FormDataProviderItems = std::vector; +inline FormDataProvider +make_file_provider(const std::string &name, const std::string &filepath, + const std::string &filename = std::string(), + const std::string &content_type = std::string()) { + FormDataProvider fdp; + fdp.name = name; + fdp.filename = filename.empty() ? 
filepath : filename; + fdp.content_type = content_type; + fdp.provider = [filepath](size_t offset, DataSink &sink) -> bool { + std::ifstream f(filepath, std::ios::binary); + if (!f) { return false; } + if (offset > 0) { + f.seekg(static_cast(offset)); + if (!f.good()) { + sink.done(); + return true; + } + } + char buf[8192]; + f.read(buf, sizeof(buf)); + auto n = static_cast(f.gcount()); + if (n > 0) { return sink.write(buf, n); } + sink.done(); // EOF + return true; + }; + return fdp; +} + +inline std::pair +make_file_body(const std::string &filepath) { + std::ifstream f(filepath, std::ios::binary | std::ios::ate); + if (!f) { return {0, ContentProvider{}}; } + auto size = static_cast(f.tellg()); + + ContentProvider provider = [filepath](size_t offset, size_t length, + DataSink &sink) -> bool { + std::ifstream f(filepath, std::ios::binary); + if (!f) { return false; } + f.seekg(static_cast(offset)); + if (!f.good()) { return false; } + char buf[8192]; + while (length > 0) { + auto to_read = (std::min)(sizeof(buf), length); + f.read(buf, static_cast(to_read)); + auto n = static_cast(f.gcount()); + if (n == 0) { break; } + if (!sink.write(buf, n)) { return false; } + length -= n; + } + return true; + }; + return {size, std::move(provider)}; +} + using ContentReceiverWithProgress = std::function; @@ -1010,6 +1269,10 @@ struct Response { std::string body; std::string location; // Redirect location + // User-defined context — set by pre-routing/pre-request handlers and read + // by route handlers to pass arbitrary data (e.g. decoded auth tokens). 
+ std::map user_data; + bool has_header(const std::string &key) const; std::string get_header_value(const std::string &key, const char *def = "", size_t id = 0) const; @@ -1115,6 +1378,7 @@ public: virtual bool is_readable() const = 0; virtual bool wait_readable() const = 0; virtual bool wait_writable() const = 0; + virtual bool is_peer_alive() const { return wait_writable(); } virtual ssize_t read(char *ptr, size_t size) = 0; virtual ssize_t write(const char *ptr, size_t size) = 0; @@ -1124,6 +1388,11 @@ public: virtual time_t duration() const = 0; + virtual void set_read_timeout(time_t sec, time_t usec = 0) { + (void)sec; + (void)usec; + } + ssize_t write(const char *ptr); ssize_t write(const std::string &s); @@ -1146,7 +1415,7 @@ public: class ThreadPool final : public TaskQueue { public: - explicit ThreadPool(size_t n, size_t mqr = 0); + explicit ThreadPool(size_t n, size_t max_n = 0, size_t mqr = 0); ThreadPool(const ThreadPool &) = delete; ~ThreadPool() override = default; @@ -1154,20 +1423,22 @@ public: void shutdown() override; private: - struct worker { - explicit worker(ThreadPool &pool); + void worker(bool is_dynamic); + void move_to_finished(std::thread::id id); + void cleanup_finished_threads(); - void operator()(); - - ThreadPool &pool_; - }; - friend struct worker; - - std::vector threads_; - std::list> jobs_; + size_t base_thread_count_; + size_t max_thread_count_; + size_t max_queued_requests_; + size_t idle_thread_count_; bool shutdown_; - size_t max_queued_requests_ = 0; + + std::list> jobs_; + std::vector threads_; // base threads + std::list dynamic_threads_; // dynamic threads + std::vector + finished_threads_; // exited dynamic threads awaiting join std::condition_variable cond_; std::mutex mutex_; @@ -1294,6 +1565,11 @@ public: using Expect100ContinueHandler = std::function; + using WebSocketHandler = + std::function; + using SubProtocolSelector = + std::function &protocols)>; + Server(); virtual ~Server(); @@ -1311,6 +1587,10 @@ public: 
Server &Delete(const std::string &pattern, HandlerWithContentReader handler); Server &Options(const std::string &pattern, Handler handler); + Server &WebSocket(const std::string &pattern, WebSocketHandler handler); + Server &WebSocket(const std::string &pattern, WebSocketHandler handler, + SubProtocolSelector sub_protocol_selector); + bool set_base_dir(const std::string &dir, const std::string &mount_point = std::string()); bool set_mount_point(const std::string &mount_point, const std::string &dir, @@ -1386,7 +1666,8 @@ protected: int remote_port, const std::string &local_addr, int local_port, bool close_connection, bool &connection_closed, - const std::function &setup_request); + const std::function &setup_request, + bool *websocket_upgraded = nullptr); std::atomic svr_sock_{INVALID_SOCKET}; @@ -1488,6 +1769,14 @@ private: HandlersForContentReader delete_handlers_for_content_reader_; Handlers options_handlers_; + struct WebSocketHandlerEntry { + std::unique_ptr matcher; + WebSocketHandler handler; + SubProtocolSelector sub_protocol_selector; + }; + using WebSocketHandlers = std::vector; + WebSocketHandlers websocket_handlers_; + HandlerWithResponse error_handler_; ExceptionHandler exception_handler_; HandlerWithResponse pre_routing_handler_; @@ -2970,6 +3259,36 @@ struct MbedTlsContext { } // namespace tls #endif +#ifdef CPPHTTPLIB_WOLFSSL_SUPPORT +namespace tls { +namespace impl { + +// wolfSSL context wrapper (holds WOLFSSL_CTX and related state). +// This struct is accessible via tls::impl for use in SSL context +// setup callbacks (cast ctx_t to tls::impl::WolfSSLContext*). 
+struct WolfSSLContext { + WOLFSSL_CTX *ctx = nullptr; + bool is_server = false; + bool verify_client = false; + bool has_verify_callback = false; + std::string ca_pem_data_; // accumulated PEM for get_ca_names/get_ca_certs + + WolfSSLContext(); + ~WolfSSLContext(); + + WolfSSLContext(const WolfSSLContext &) = delete; + WolfSSLContext &operator=(const WolfSSLContext &) = delete; +}; + +// CA store for wolfSSL: holds raw PEM bytes to allow reloading into any ctx +struct WolfSSLCAStore { + std::string pem_data; +}; + +} // namespace impl +} // namespace tls +#endif + #endif // CPPHTTPLIB_SSL_ENABLED namespace stream { @@ -3335,6 +3654,143 @@ private: } // namespace sse +namespace ws { + +enum class Opcode : uint8_t { + Continuation = 0x0, + Text = 0x1, + Binary = 0x2, + Close = 0x8, + Ping = 0x9, + Pong = 0xA, +}; + +enum class CloseStatus : uint16_t { + Normal = 1000, + GoingAway = 1001, + ProtocolError = 1002, + UnsupportedData = 1003, + NoStatus = 1005, + Abnormal = 1006, + InvalidPayload = 1007, + PolicyViolation = 1008, + MessageTooBig = 1009, + MandatoryExtension = 1010, + InternalError = 1011, +}; + +enum ReadResult : int { Fail = 0, Text = 1, Binary = 2 }; + +class WebSocket { +public: + WebSocket(const WebSocket &) = delete; + WebSocket &operator=(const WebSocket &) = delete; + ~WebSocket(); + + ReadResult read(std::string &msg); + bool send(const std::string &data); + bool send(const char *data, size_t len); + void close(CloseStatus status = CloseStatus::Normal, + const std::string &reason = ""); + const Request &request() const; + bool is_open() const; + +private: + friend class httplib::Server; + friend class WebSocketClient; + + WebSocket(Stream &strm, const Request &req, bool is_server) + : strm_(strm), req_(req), is_server_(is_server) { + start_heartbeat(); + } + + WebSocket(std::unique_ptr &&owned_strm, const Request &req, + bool is_server) + : strm_(*owned_strm), owned_strm_(std::move(owned_strm)), req_(req), + is_server_(is_server) { + 
start_heartbeat(); + } + + void start_heartbeat(); + bool send_frame(Opcode op, const char *data, size_t len, bool fin = true); + + Stream &strm_; + std::unique_ptr owned_strm_; + Request req_; + bool is_server_; + std::atomic closed_{false}; + std::mutex write_mutex_; + std::thread ping_thread_; + std::mutex ping_mutex_; + std::condition_variable ping_cv_; +}; + +class WebSocketClient { +public: + explicit WebSocketClient(const std::string &scheme_host_port_path, + const Headers &headers = {}); + + ~WebSocketClient(); + WebSocketClient(const WebSocketClient &) = delete; + WebSocketClient &operator=(const WebSocketClient &) = delete; + + bool is_valid() const; + + bool connect(); + ReadResult read(std::string &msg); + bool send(const std::string &data); + bool send(const char *data, size_t len); + void close(CloseStatus status = CloseStatus::Normal, + const std::string &reason = ""); + bool is_open() const; + const std::string &subprotocol() const; + void set_read_timeout(time_t sec, time_t usec = 0); + void set_write_timeout(time_t sec, time_t usec = 0); + +#ifdef CPPHTTPLIB_SSL_ENABLED + void set_ca_cert_path(const std::string &path); + void set_ca_cert_store(tls::ca_store_t store); + void enable_server_certificate_verification(bool enabled); +#endif + +private: + void shutdown_and_close(); + bool create_stream(std::unique_ptr &strm); + + std::string host_; + int port_; + std::string path_; + Headers headers_; + std::string subprotocol_; + bool is_valid_ = false; + socket_t sock_ = INVALID_SOCKET; + std::unique_ptr ws_; + time_t read_timeout_sec_ = CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND; + time_t read_timeout_usec_ = 0; + time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; + time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; + +#ifdef CPPHTTPLIB_SSL_ENABLED + bool is_ssl_ = false; + tls::ctx_t tls_ctx_ = nullptr; + tls::session_t tls_session_ = nullptr; + std::string ca_cert_file_path_; + tls::ca_store_t ca_cert_store_ = 
nullptr; + bool server_certificate_verification_ = true; +#endif +}; + +namespace impl { + +bool is_valid_utf8(const std::string &s); + +bool read_websocket_frame(Stream &strm, Opcode &opcode, std::string &payload, + bool &fin, bool expect_masked, size_t max_len); + +} // namespace impl + +} // namespace ws + } // namespace httplib