diff --git a/common/arg.cpp b/common/arg.cpp
index 9c85696ebd..18f953a38e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1301,7 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, bool value) {
             params.kv_unified = value;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
diff --git a/common/common.cpp b/common/common.cpp
index 93474f88c7..ec15804c91 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,7 +1,3 @@
-#if defined(_MSC_VER)
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
 #include "ggml.h"
 #include "gguf.h"
 
@@ -9,12 +5,12 @@
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
+#include "unicode.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
-#include <codecvt>
 #include <chrono>
 #include <cstdarg>
 #include <cstring>
@@ -706,45 +702,28 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
         return false;
     }
 
-    std::u32string filename_utf32;
-    try {
-#if defined(__clang__)
-        // disable C++17 deprecation warning for std::codecvt_utf8
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
+    size_t offset = 0;
+    while (offset < filename.size()) {
+        utf8_parse_result result = parse_utf8_codepoint(filename, offset);
 
-        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic pop
-#endif
-
-        filename_utf32 = converter.from_bytes(filename);
-
-        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
-        // or invalid encodings were encountered. Reject such attempts
-        std::string filename_reencoded = converter.to_bytes(filename_utf32);
-        if (filename_reencoded != filename) {
+        if (result.status != utf8_parse_result::SUCCESS) {
             return false;
         }
-    } catch (const std::exception &) {
-        return false;
-    }
+        uint32_t c = result.codepoint;
 
-    // Check for forbidden codepoints:
-    // - Control characters
-    // - Unicode equivalents of illegal characters
-    // - UTF-16 surrogate pairs
-    // - UTF-8 replacement character
-    // - Byte order mark (BOM)
-    // - Illegal characters: / \ : * ? " < > |
-    for (char32_t c : filename_utf32) {
+        if ((result.bytes_consumed == 2 && c < 0x80) ||
+            (result.bytes_consumed == 3 && c < 0x800) ||
+            (result.bytes_consumed == 4 && c < 0x10000)) {
+            return false;
+        }
+
+        // Check for forbidden codepoints:
+        // - Control characters
+        // - Unicode equivalents of illegal characters
+        // - UTF-16 surrogate pairs
+        // - UTF-8 replacement character
+        // - Byte order mark (BOM)
+        // - Illegal characters: / \ : * ? " < > |
         if (c <= 0x1F // Control characters (C0)
             || c == 0x7F // Control characters (DEL)
             || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
@@ -752,6 +731,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
             || c == 0x2215 // Division Slash (forward slash equivalent)
             || c == 0x2216 // Set Minus (backslash equivalent)
             || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c > 0x10FFFF // Max Unicode limit
             || c == 0xFFFD // Replacement Character (UTF-8)
             || c == 0xFEFF // Byte Order Mark (BOM)
             || c == ':' || c == '*' // Illegal characters
@@ -762,6 +742,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
             // Subdirectories not allowed, reject path separators
             return false;
         }
+        offset += result.bytes_consumed;
     }
 
     // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md
index 8e1f37b206..2c3f88e91a 100644
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -35,7 +35,7 @@ Adapt below build commands accordingly.
 Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
 
 ```
-[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
+[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
 
 [d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
 Preset CMake variables:
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index 8694fd06c7..35735d48b2 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -63,11 +63,19 @@ static __global__ void flash_attn_ext_f16(
     constexpr int frag_m = ncols == 8 ? 32 : 16;
     constexpr int frag_n = ncols == 8 ?  8 : 16;
     static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
+#if defined(GGML_USE_HIP)
+    typedef wmma::fragment<wmma::matrix_a,    frag_m, frag_n, 16, _Float16, wmma::row_major> frag_a_K;
+    typedef wmma::fragment<wmma::matrix_a,    frag_m, frag_n, 16, _Float16, wmma::col_major> frag_a_V;
+    typedef wmma::fragment<wmma::matrix_b,    frag_m, frag_n, 16, _Float16, wmma::col_major> frag_b;
+    typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t>                      frag_c_KQ;
+    typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, _Float16>                          frag_c_VKQ;
+#else
     typedef wmma::fragment<wmma::matrix_a,    frag_m, frag_n, 16, half, wmma::row_major> frag_a_K;
     typedef wmma::fragment<wmma::matrix_a,    frag_m, frag_n, 16, half, wmma::col_major> frag_a_V;
     typedef wmma::fragment<wmma::matrix_b,    frag_m, frag_n, 16, half, wmma::col_major> frag_b;
     typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t>                      frag_c_KQ;
     typedef wmma::fragment<wmma::accumulator, frag_m, frag_n, 16, half>                          frag_c_VKQ;
+#endif
 
     constexpr int KQ_stride_tc  = nwarps*frag_m; // Number of KQ rows calculated in parallel.
     constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
@@ -126,6 +134,19 @@ static __global__ void flash_attn_ext_f16(
 
     __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
     half2 * VKQ2 = (half2 *) VKQ;
+
+#if defined(GGML_USE_HIP)
+    const _Float16 * K_h_f16  = reinterpret_cast<const _Float16 *>(K_h);
+    const _Float16 * V_h_f16  = reinterpret_cast<const _Float16 *>(V_h);
+    _Float16       * KQ_f16   = reinterpret_cast<_Float16 *>(KQ);
+    _Float16       * VKQ_f16  = reinterpret_cast<_Float16 *>(VKQ);
+#else
+    const half * K_h_f16  = K_h;
+    const half * V_h_f16  = V_h;
+    half       * KQ_f16   = KQ;
+    half       * VKQ_f16  = VKQ;
+#endif
+
 #pragma unroll
     for (int j0 = 0; j0 < ncols; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -160,7 +181,7 @@ static __global__ void flash_attn_ext_f16(
     for (int i0 = 0; i0 < D; i0 += 16) {
 #pragma unroll
         for (int j0 = 0; j0 < ncols; j0 += frag_n) {
-            wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
+            wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ_f16 + j0*D_padded + i0, D_padded);
         }
     }
 
@@ -180,7 +201,7 @@ static __global__ void flash_attn_ext_f16(
 #pragma unroll
             for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
                 frag_a_K K_a;
-                wmma::load_matrix_sync(K_a, K_h + int64_t(k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
+                wmma::load_matrix_sync(K_a, K_h_f16 + int64_t(k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
 #pragma unroll
                 for (int j = 0; j < ncols/frag_n; ++j) {
                     wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
@@ -310,7 +331,7 @@ static __global__ void flash_attn_ext_f16(
                 const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
                 wmma::load_matrix_sync(
                     KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
-                    KQ + j0*(kqar*kqs_padded) + k,
+                    KQ_f16 + j0*(kqar*kqs_padded) + k,
                     kqar*kqs_padded);
             }
         }
@@ -328,7 +349,7 @@ static __global__ void flash_attn_ext_f16(
                 const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
 
                 frag_a_V v_a;
-                wmma::load_matrix_sync(v_a, V_h + int64_t(k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
+                wmma::load_matrix_sync(v_a, V_h_f16 + int64_t(k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
 #pragma unroll
                 for (int j = 0; j < ncols/frag_n; ++j) {
                     wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
@@ -344,7 +365,7 @@ static __global__ void flash_attn_ext_f16(
 #pragma unroll
             for (int j0 = 0; j0 < ncols; j0 += frag_n) {
                 wmma::store_matrix_sync(
-                    KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
+                    KQ_f16 + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
                     VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
                     D_padded, wmma::mem_col_major);
             }
diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index c0d72587ce..f1ad24dbfa 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -64,25 +64,12 @@ struct htp_ops_context {
     struct fastdiv_values broadcast_rv2;
     struct fastdiv_values broadcast_rv3;
 
-    struct fastdiv_values mm_div_ne12_ne1; // fastdiv values for ne12 * ne1
-    struct fastdiv_values mm_div_ne1;      // fastdiv values for ne1
-    struct fastdiv_values mm_div_r2;       // fastdiv values for ne12 / ne02
-    struct fastdiv_values mm_div_r3;       // fastdiv values for ne13 / ne03
-
     struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12
     struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11
 
     struct fastdiv_values get_rows_div_ne10;      // fastdiv values for ne10
     struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11
 
-    struct fastdiv_values cpy_div_ne01; // fastdiv values for ne01
-    struct fastdiv_values cpy_div_ne02; // fastdiv values for ne02
-    struct fastdiv_values cpy_div_ne03; // fastdiv values for ne03
-
-    struct fastdiv_values cpy_rshp_div_n0;       // fastdiv values for ne00
-    struct fastdiv_values cpy_rshp_div_n1n0;     // fastdiv values for ne00*ne01
-    struct fastdiv_values cpy_rshp_div_n2n1n0;   // fastdiv values for ne00*ne01*ne02
-
     uint32_t flags;
 };
 
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 62708eee5c..92a1422896 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -189,7 +189,7 @@ static int vtcm_release_callback(unsigned int rctx, void * state) {
     // otherwise we'll release it once we're done with the current Op.
 
     if (ctx->vtcm_inuse) {
-        ctx->vtcm_needs_release = false;
+        ctx->vtcm_needs_release = true;
         return 0;
     }
 
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index d251eeed33..c360abe8da 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -23,10 +23,30 @@
 #define MM_SPAD_SRC1_NROWS 16
 #define MM_SPAD_DST_NROWS  2
 
-struct htp_matmul_type {
+struct htp_matmul_context {
     const char * type;
-    void (*vec_dot)(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-    void (*vec_dot_rx2)(const int n, float * restrict s, const void * restrict vx, uint32_t vx_row_size, const void * restrict vy);
+    struct htp_ops_context * octx;
+
+    void (*vec_dot_1x1)(const int n, float * restrict s0,
+         const void * restrict vx0,
+         const void * restrict vy0);
+
+    void (*vec_dot_2x1)(const int n, float * restrict s0,
+         const void * restrict vx0, const void * restrict vx1,
+         const void * restrict vy0);
+
+    void (*vec_dot_2x2)(const int n, float * restrict s0, float * restrict s1,
+         const void * restrict vx0, const void * restrict vx1,
+         const void * restrict vy0, const void * restrict vy1);
+
+    // Precomputed values
+    uint32_t src0_nrows_per_thread;
+    uint32_t src1_nrows_per_thread;
+
+    struct fastdiv_values mm_div_ne12_ne1;
+    struct fastdiv_values mm_div_ne1;
+    struct fastdiv_values mm_div_r2;
+    struct fastdiv_values mm_div_r3;
 };
 
 // vdelta control to replicate first 4x fp32 values across lanes
@@ -122,6 +142,7 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
     HVX_Vector v6_7 = vptr[3];  // ...
 
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
 
     HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4);  // & 0x0F
     HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4);    // >> 4
@@ -133,15 +154,14 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
     HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4);    // >> 4
 
     // Convert uint4 to int4 (i.e. x - 8)
-    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
-    v0                  = Q6_Vb_vsub_VbVb(v0, i8);
-    v1                  = Q6_Vb_vsub_VbVb(v1, i8);
-    v2                  = Q6_Vb_vsub_VbVb(v2, i8);
-    v3                  = Q6_Vb_vsub_VbVb(v3, i8);
-    v4                  = Q6_Vb_vsub_VbVb(v4, i8);
-    v5                  = Q6_Vb_vsub_VbVb(v5, i8);
-    v6                  = Q6_Vb_vsub_VbVb(v6, i8);
-    v7                  = Q6_Vb_vsub_VbVb(v7, i8);
+    v0 = Q6_Vb_vsub_VbVb(v0, i8);
+    v1 = Q6_Vb_vsub_VbVb(v1, i8);
+    v2 = Q6_Vb_vsub_VbVb(v2, i8);
+    v3 = Q6_Vb_vsub_VbVb(v3, i8);
+    v4 = Q6_Vb_vsub_VbVb(v4, i8);
+    v5 = Q6_Vb_vsub_VbVb(v5, i8);
+    v6 = Q6_Vb_vsub_VbVb(v6, i8);
+    v7 = Q6_Vb_vsub_VbVb(v7, i8);
 
     HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
     return r;
@@ -156,6 +176,7 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr)
     HVX_Vector v6_7 = vptr[3];  // ...
 
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
 
     HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4);  // & 0x0F
     HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4);    // >> 4
@@ -166,15 +187,14 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr)
     HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4);  // & 0x0F
     HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4);    // >> 4
 
-    HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
-    v0             = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
-    v1             = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
-    v2             = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
-    v3             = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
-    v4             = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
-    v5             = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
-    v6             = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
-    v7             = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
+    v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
+    v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
+    v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
+    v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
+    v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
+    v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
+    v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
+    v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
 
     HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
     return r;
@@ -196,46 +216,6 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
     return r;
 }
 
-static inline HVX_Vector_x4 hvx_vec_load_x4_f16(const uint8_t * restrict ptr) {
-    const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
-
-    HVX_Vector v0 = vptr[0];  // first  64 vals
-    HVX_Vector v1 = vptr[1];  // second 64 vals
-    HVX_Vector v2 = vptr[2];  // third  64 vals
-    HVX_Vector v3 = vptr[3];  // forth  64 vals
-
-    HVX_Vector_x4 r = { v0, v1, v2, v3 };
-    return r;
-}
-
-static inline HVX_Vector_x4 hvx_vec_load_x4_f32_as_f16(const uint8_t * restrict ptr) {
-    const HVX_VectorPair * restrict vptr = (const HVX_VectorPair *) ptr;
-
-    HVX_VectorPair v0 = vptr[0];  // first  64 vals
-    HVX_VectorPair v1 = vptr[1];  // second 64 vals
-    HVX_VectorPair v2 = vptr[2];  // third  64 vals
-    HVX_VectorPair v3 = vptr[3];  // forth  64 vals
-
-    HVX_Vector vq0_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v0), Q6_V_vzero());
-    HVX_Vector vq0_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v0), Q6_V_vzero());
-    HVX_Vector vq1_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v1), Q6_V_vzero());
-    HVX_Vector vq1_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v1), Q6_V_vzero());
-    HVX_Vector vq2_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v2), Q6_V_vzero());
-    HVX_Vector vq2_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v2), Q6_V_vzero());
-    HVX_Vector vq3_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v3), Q6_V_vzero());
-    HVX_Vector vq3_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v3), Q6_V_vzero());
-
-    HVX_Vector vh0 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq0_hi, vq0_lo));
-    HVX_Vector vh1 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq1_hi, vq1_lo));
-    HVX_Vector vh2 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq2_hi, vq2_lo));
-    HVX_Vector vh3 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq3_hi, vq3_lo));
-
-    // vcombine does a shuffle, use vdeal to undo
-
-    HVX_Vector_x4 r = { Q6_Vh_vdeal_Vh(vh0), Q6_Vh_vdeal_Vh(vh1), Q6_Vh_vdeal_Vh(vh2), Q6_Vh_vdeal_Vh(vh3) };
-    return r;
-}
-
 // Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
 // Accumulate each block into a single int32 value.
 // Return a single HVX vector with 32x int32 accumulators.
@@ -300,26 +280,26 @@ static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y,
     return hvx_vec_rmpy_x8_n(x, y, 1024);
 }
 
-static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
-    const uint32_t x_qblk_size = qk / 2;                                     // int4
-    const uint32_t x_qrow_size = n / 2;                                      // int4 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                  // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                         // int8
-    const uint32_t y_qrow_size = n;                                          // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size);  // then scales
 
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -372,36 +352,34 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
 
     r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
-    hvx_vec_store_u(&s[0], 4, r0_sum);
+    hvx_vec_store_u(s0, 4, r0_sum);
 }
 
-static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
-                                      float * restrict s,
-                                      const void * restrict vx,
-                                      uint32_t vx_row_size,
-                                      const void * restrict vy) {
+static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                                        // 32x __fp16
-    const uint32_t x_qblk_size = qk / 2;                                                           // int4
-    const uint32_t x_qrow_size = n / 2;                                                            // int4 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                                        // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                                               // int8
-    const uint32_t y_qrow_size = n;                                                                // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
-    const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size);  // then scales
-
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);                                     // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);                           // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -468,13 +446,143 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
     }
 
     HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
-    hvx_vec_store_u(&s[0], 8, rsum);
+    hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
+                                        const void * restrict vx0, const void * restrict vx1,
+                                        const void * restrict vy0, const void * restrict vy1) {
+    assert(n % 32 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+    assert((unsigned long) vy1 % 128 == 0);
+
+    const uint32_t qk = QK_Q4_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk / 2;                                      // int4
+    const uint32_t x_qrow_size = n / 2;                                       // int4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;              // quants first
+    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales
+    const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;              // quants first
+    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales
+
+    // Row sums (sf) - 4 accumulators for 2×2 tile
+    HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        // Load src1 columns (reused across both src0 rows)
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+
+        // Load src0 rows (reused across both src1 columns)
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
+
+        // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
+
+        // Load scales
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        // Compute combined scales
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Apply scales and accumulate
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+        HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
+
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Zero out unused scales
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
+        r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
+        r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
+        r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
+        r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
+        r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
+        r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
+        r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
+
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Reduce and store results
+    HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
+    HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
+
+    hvx_vec_store_u(s0, 8, r0_r1_c0_sum);  // row0,col0 row1,col0
+    hvx_vec_store_u(s1, 8, r0_r1_c1_sum);  // row0,col1 row1,col1
+}
+
+static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
@@ -486,11 +594,11 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
     const uint32_t y_qblk_size = qk;                                         // int8
     const uint32_t y_qrow_size = n;                                          // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
 
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -543,36 +651,34 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
 
     r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
-    hvx_vec_store_u(&s[0], 4, r0_sum);
+    hvx_vec_store_u(s0, 4, r0_sum);
 }
 
-static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
-                                      float * restrict s,
-                                      const void * restrict vx,
-                                      uint32_t vx_row_size,
-                                      const void * restrict vy) {
+static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_Q4_0x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 2;                                                        // 32x __fp16
-    const uint32_t x_qblk_size = qk;                                                               // int8
-    const uint32_t x_qrow_size = n;                                                                // int8 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk;                                          // int8
+    const uint32_t x_qrow_size = n;                                           // int8 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                                        // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                                               // int8
-    const uint32_t y_qrow_size = n;                                                                // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
-    const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size);  // then scales
-
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);                                     // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);                           // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);     // then scales
 
     // Row sum (qf32)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -639,16 +745,143 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
     }
 
     HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
-    hvx_vec_store_u(&s[0], 8, rsum);
+    hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
-                                     float * restrict s,
-                                     const void * restrict vx,
-                                     const void * restrict vy) {
+static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
+                                        const void * restrict vx0, const void * restrict vx1,
+                                        const void * restrict vy0, const void * restrict vy1) {
+    assert(n % 32 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+    assert((unsigned long) vy1 % 128 == 0);
+
+    const uint32_t qk = QK_Q8_0x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t x_qblk_size = qk;                                          // int8
+    const uint32_t x_qrow_size = n;                                           // int8 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;              // quants first
+    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales
+    const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;              // quants first
+    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales
+
+    // Row sums (sf) - 4 accumulators for 2×2 tile
+    HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        // Load src1 columns (reused across both src0 rows)
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+
+        // Load src0 rows (reused across both src1 columns)
+        HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
+
+        // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
+
+        // Load scales
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        // Compute combined scales
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Apply scales and accumulate
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+        HVX_Vector_x8 r0_q  = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q  = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
+
+        HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
+        HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
+        HVX_Vector r0_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
+        HVX_Vector r1_d  = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
+
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
+
+        // Zero out unused scales
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
+        r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
+        r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
+        r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
+        r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
+        r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
+        r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
+        r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
+
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Reduce and store results
+    HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
+    HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
+
+    hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);  // row0,col0 row1,col0
+    hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);  // row0,col1 row1,col1
+}
+
+static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_MXFP4x4x2 * 4;
 
@@ -660,11 +893,11 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
     const uint32_t y_qblk_size = qk;                                         // int8
     const uint32_t y_qrow_size = n;                                          // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0);           // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
 
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0);              // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size);    // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -747,36 +980,34 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
 
     r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
-    hvx_vec_store_u(&s[0], 4, r0_sum);
+    hvx_vec_store_u(s0, 4, r0_sum);
 }
 
-static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
-                                         float * restrict s,
-                                         const void * restrict vx,
-                                         uint32_t vx_row_size,
-                                         const void * restrict vy) {
+static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
+                                      const void * restrict vx0, const void * restrict vx1,
+                                      const void * restrict vy0) {
     assert(n % 32 == 0);  // min sub-block size
-    assert((unsigned long) vx % 128 == 0);
-    assert((unsigned long) vy % 128 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
 
     const uint32_t qk = QK_MXFP4x4x2 * 4;
 
-    const uint32_t x_dblk_size = 8 * 4 * 1;                                                        // 32x e8m0
-    const uint32_t x_qblk_size = qk / 2;                                                           // fp4
-    const uint32_t x_qrow_size = n / 2;                                                            // fp4 (not padded)
+    const uint32_t x_dblk_size = 8 * 4 * 1;                                   // 32x e8m0
+    const uint32_t x_qblk_size = qk / 2;                                      // fp4
+    const uint32_t x_qrow_size = n / 2;                                       // fp4 (not padded)
 
-    const uint32_t y_dblk_size = 8 * 4 * 2;                                                        // 32x __fp16
-    const uint32_t y_qblk_size = qk;                                                               // int8
-    const uint32_t y_qrow_size = n;                                                                // int8 (not padded)
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
 
-    const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size);  // then scales
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
 
-    const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0);            // quants first
-    const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size);  // then scales
-
-    const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);                                     // quants first
-    const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);                           // then scales
+    const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0;               // quants first
+    const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size;     // then scales
 
     // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
@@ -879,10 +1110,180 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
     }
 
     HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
-    hvx_vec_store_u(&s[0], 8, rsum);
+    hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
+                                        const void * restrict vx0, const void * restrict vx1,
+                                        const void * restrict vy0, const void * restrict vy1) {
+    assert(n % 32 == 0);
+    assert((unsigned long) vx0 % 128 == 0);
+    assert((unsigned long) vx1 % 128 == 0);
+    assert((unsigned long) vy0 % 128 == 0);
+    assert((unsigned long) vy1 % 128 == 0);
+
+    const uint32_t qk = QK_MXFP4x4x2 * 4;
+
+    const uint32_t x_dblk_size = 8 * 4 * 1;                                   // 32x e8m0
+    const uint32_t x_qblk_size = qk / 2;                                      // fp4
+    const uint32_t x_qrow_size = n / 2;                                       // fp4 (not padded)
+
+    const uint32_t y_dblk_size = 8 * 4 * 2;                                   // 32x __fp16
+    const uint32_t y_qblk_size = qk;                                          // int8
+    const uint32_t y_qrow_size = n;                                           // int8 (not padded)
+
+    const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;            // quants first
+    const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;  // then scales
+    const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;            // quants first
+    const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;  // then scales
+
+    const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;              // quants first
+    const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;    // then scales
+    const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;              // quants first
+    const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;    // then scales
+
+    // Row sums (sf) - 4 accumulators for 2×2 tile
+    HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
+
+    const uint32_t nb   = n / qk;  // num full blocks
+    const uint32_t nloe = n % qk;  // num leftover elements
+
+    uint32_t i = 0;
+    for (; i < nb; i++) {
+        // Load src1 columns (reused across both src0 rows)
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+
+        // Load src0 rows (reused across both src1 columns)
+        HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
+
+        // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
+
+        // Load scales
+        HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d   + i * y_dblk_size);
+        HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d   + i * y_dblk_size);
+        HVX_Vector r0_d  = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_Vector r1_d  = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+
+        // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
+        HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
+        vy0_d           = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half));
+        vy0_d           = Q6_Vsf_equals_Vqf32(vy0_d);
+        vy1_d           = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half));
+        vy1_d           = Q6_Vsf_equals_Vqf32(vy1_d);
+
+        // Convert rX_d scales from e8m0 to fp32
+        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
+        // Left shift with zero fill to create FP32
+        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
+        HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
+        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
+        r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+        r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
+        r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
+        r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
+
+        // Compute combined scales
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d));
+
+        // Apply scales and accumulate
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Process leftovers
+    if (nloe) {
+        HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
+        HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
+        HVX_Vector_x8 r0_q  = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
+        HVX_Vector_x8 r1_q  = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
+
+        HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
+        HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
+        HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
+        HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
+
+        HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d   + i * y_dblk_size);
+        HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d   + i * y_dblk_size);
+        HVX_Vector r0_d  = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
+        HVX_Vector r1_d  = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
+
+        // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
+        HVX_Vector half = Q6_Vh_vsplat_R(0x3800);  // 0.5 in fp16
+        vy0_d           = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half));
+        vy0_d           = Q6_Vsf_equals_Vqf32(vy0_d);
+        vy1_d           = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half));
+        vy1_d           = Q6_Vsf_equals_Vqf32(vy1_d);
+
+        // Convert rX_d scales from e8m0 to fp32
+        // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
+        // Left shift with zero fill to create FP32
+        // FIXME: might need to handle zero as a special case (see ggml-cpu code)
+        HVX_Vector expand    = *(const HVX_Vector *) expand_x32_e8m0;
+        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
+        r0_d                 = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d                 = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d                 = Q6_Vw_vasl_VwR(r0_d, 23);
+        r1_d                 = Q6_V_vdelta_VV(r1_d, expand);
+        r1_d                 = Q6_V_vand_VV(r1_d, e8m0_mask);
+        r1_d                 = Q6_Vw_vasl_VwR(r1_d, 23);
+
+        HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d));
+        HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d));
+        HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d));
+        HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d));
+
+        // Zero out unused scales
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
+        r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
+        r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
+        r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
+        r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
+        r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
+        r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
+        r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
+        r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
+
+        HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
+        HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
+        HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
+        HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
+    }
+
+    // Reduce and store results
+    HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
+    HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
+
+    hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);  // row0,col0 row1,col0
+    hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);  // row0,col1 row1,col1
+}
+
+static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const HVX_Vector * restrict x = (const HVX_Vector *) vx;
     const HVX_Vector * restrict y = (const HVX_Vector *) vy;
 
@@ -912,14 +1313,12 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res
     hvx_vec_store_u(&s[0], 4, rsum);
 }
 
-static void vec_dot_f16_f16_aa_rx2(const int n,
-                                float * restrict s,
-                                const void * restrict vx,
-                                uint32_t vx_row_size,
-                                const void * restrict vy) {
-    const HVX_Vector * restrict x0 = (const HVX_Vector *) vx;
-    const HVX_Vector * restrict x1 = (const HVX_Vector *) ((const uint8_t *) vx + vx_row_size);
-    const HVX_Vector * restrict y  = (const HVX_Vector *) vy;
+static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0,
+                                const void * restrict vx0, const void * restrict vx1,
+                                const void * restrict vy0) {
+    const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
+    const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
+    const HVX_Vector * restrict y  = (const HVX_Vector *) vy0;
 
     uint32_t nvec = n / VLEN_FP16;
     uint32_t nloe = n % VLEN_FP16;
@@ -953,10 +1352,86 @@ static void vec_dot_f16_f16_aa_rx2(const int n,
     }
 
     HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1));
-    hvx_vec_store_u(&s[0], 8, rsum);
+    hvx_vec_store_u(s0, 8, rsum);
 }
 
-static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * restrict s1,
+                                const void * restrict vx0, const void * restrict vx1,
+                                const void * restrict vy0, const void * restrict vy1) {
+    const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
+    const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
+    const HVX_Vector * restrict y0 = (const HVX_Vector *) vy0;
+    const HVX_Vector * restrict y1 = (const HVX_Vector *) vy1;
+
+    uint32_t nvec = n / VLEN_FP16;
+    uint32_t nloe = n % VLEN_FP16;
+
+    // Row sums (sf) - 4 accumulators for 2×2 tile
+    HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
+    HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
+
+    uint32_t i = 0;
+
+    #pragma unroll(2)
+    for (i = 0; i < nvec; i++) {
+        HVX_Vector r0_hf = x0[i];
+        HVX_Vector r1_hf = x1[i];
+        HVX_Vector c0_hf = y0[i];
+        HVX_Vector c1_hf = y1[i];
+
+        // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
+        HVX_VectorPair r0_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c0_hf);
+        HVX_VectorPair r0_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c1_hf);
+        HVX_VectorPair r1_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c0_hf);
+        HVX_VectorPair r1_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c1_hf);
+
+        HVX_Vector r0_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c0_qf_p), Q6_V_hi_W(r0_c0_qf_p));
+        HVX_Vector r0_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c1_qf_p), Q6_V_hi_W(r0_c1_qf_p));
+        HVX_Vector r1_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c0_qf_p), Q6_V_hi_W(r1_c0_qf_p));
+        HVX_Vector r1_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c1_qf_p), Q6_V_hi_W(r1_c1_qf_p));
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_qf, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_qf, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_qf, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_qf, r1_c1_sum));
+    }
+
+    if (nloe) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
+
+        HVX_Vector r0_hf = Q6_V_vand_QV(bmask, x0[i]);
+        HVX_Vector r1_hf = Q6_V_vand_QV(bmask, x1[i]);
+        HVX_Vector c0_hf = Q6_V_vand_QV(bmask, y0[i]);
+        HVX_Vector c1_hf = Q6_V_vand_QV(bmask, y1[i]);
+
+        HVX_VectorPair r0_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c0_hf);
+        HVX_VectorPair r0_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c1_hf);
+        HVX_VectorPair r1_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c0_hf);
+        HVX_VectorPair r1_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c1_hf);
+
+        HVX_Vector r0_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c0_qf_p), Q6_V_hi_W(r0_c0_qf_p));
+        HVX_Vector r0_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c1_qf_p), Q6_V_hi_W(r0_c1_qf_p));
+        HVX_Vector r1_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c0_qf_p), Q6_V_hi_W(r1_c0_qf_p));
+        HVX_Vector r1_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c1_qf_p), Q6_V_hi_W(r1_c1_qf_p));
+
+        r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_qf, r0_c0_sum));
+        r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_qf, r0_c1_sum));
+        r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_qf, r1_c0_sum));
+        r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_qf, r1_c1_sum));
+
+    }
+
+    // Reduce and store results
+    HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
+    HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
+
+    hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);  // row0,col0 row1,col0
+    hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);  // row0,col1 row1,col1
+}
+
+static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const HVX_UVector * restrict x = (const HVX_UVector *) vx;
     const HVX_UVector * restrict y = (const HVX_UVector *) vy;
 
@@ -986,7 +1461,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res
     hvx_vec_store_u(&s[0], 4, rsum);
 }
 
-static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
+static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x;
     const HVX_UVector * restrict vy = (const HVX_UVector * restrict) y;
 
@@ -1083,14 +1558,16 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res
     const uint32_t nb2 = dst->nb[2];   \
     const uint32_t nb3 = dst->nb[3];
 
-#define htp_matmul_preamble            \
-    htp_matmul_tensors_preamble;       \
-    dma_queue *dma_queue           = octx->ctx->dma[ith];         \
-    uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
+#define htp_matmul_preamble                                     \
+    struct htp_matmul_context * mmctx = data;                   \
+    struct htp_ops_context * octx  = mmctx->octx;               \
+    htp_matmul_tensors_preamble;                                \
+    dma_queue *dma_queue           = octx->ctx->dma[ith];       \
+    uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread;
 
 // *** matmul with support for 4d tensors and full broadcasting
 
-static void matmul_4d(struct htp_matmul_type * mt, struct htp_ops_context * octx, uint32_t nth, uint32_t ith) {
+static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
 
     uint64_t t1, t2;
@@ -1136,13 +1613,13 @@ static void matmul_4d(struct htp_matmul_type * mt, struct htp_ops_context * octx
     for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
             for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
-                const uint32_t i13 = fastdiv(ir1, &octx->mm_div_ne12_ne1);
-                const uint32_t i12 = fastdiv(ir1 - i13 * ne12 * ne1, &octx->mm_div_ne1);
+                const uint32_t i13 = fastdiv(ir1, &mmctx->mm_div_ne12_ne1);
+                const uint32_t i12 = fastdiv(ir1 - i13 * ne12 * ne1, &mmctx->mm_div_ne1);
                 const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
 
                 // broadcast src0 into src1
-                const uint32_t i03 = fastdiv(i13, &octx->mm_div_r3);
-                const uint32_t i02 = fastdiv(i12, &octx->mm_div_r2);
+                const uint32_t i03 = fastdiv(i13, &mmctx->mm_div_r3);
+                const uint32_t i02 = fastdiv(i12, &mmctx->mm_div_r2);
 
                 const uint32_t i1 = i11;
                 const uint32_t i2 = i12;
@@ -1155,7 +1632,7 @@ static void matmul_4d(struct htp_matmul_type * mt, struct htp_ops_context * octx
                 const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
                 for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
                     const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
-                    mt->vec_dot(ne00, &dst_col[ir0], src0_row, src1_col);
+                    mmctx->vec_dot_1x1(ne00, &dst_col[ir0], src0_row, src1_col);
                 }
             }
         }
@@ -1170,7 +1647,7 @@ static void matmul_4d(struct htp_matmul_type * mt, struct htp_ops_context * octx
 }
 
 // src1 tensor is already in VTCM spad
-static void matmul_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx, uint32_t nth, uint32_t ith) {
+static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
@@ -1195,7 +1672,7 @@ static void matmul_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx
     // Per-thread VTCM scratchpads for all tensors
     // Note that the entire src1 tensor is already in VTCM
     // For other tensors we allocate N rows per thread, padded to HVX vector size
-    uint8_t * restrict spad_dst  = dst_spad->data + dst_spad->size_per_thread * ith;
+    uint8_t * restrict spad_dst  = dst_spad->data  + dst_spad->size_per_thread  * ith;
     uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
     uint8_t * restrict src1_data = src1_spad->data;
 
@@ -1219,11 +1696,21 @@ static void matmul_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
-        #pragma unroll(2)
-        for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
+        // Process src1 columns in pairs (2×2 tiling)
+        uint32_t ir1 = 0;
+        for (; ir1 + 1 < src1_nrows; ir1 += 2) {
+            const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride);
+            const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride);
+            float * restrict dst_row0 = (float *) (dst->data + ((ir1+0) * dst_row_size));
+            float * restrict dst_row1 = (float *) (dst->data + ((ir1+1) * dst_row_size));
+            mmctx->vec_dot_2x2(ne00, &dst_row0[ir0], &dst_row1[ir0], ss0, ss0 + src0_stride, src1_col0, src1_col1);
+        }
+
+        // Handle remaining src1 rows (fallback to 2×1)
+        for (; ir1 < src1_nrows; ++ir1) {
             const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
             float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
-            mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_stride, src1_col);
+            mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col);
         }
 
         // Prefetch next (n + spad_nrows) row
@@ -1247,20 +1734,20 @@ static void matmul_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
             const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
             float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
-            mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
+            mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
         }
     }
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
+    FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth,
          src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
          src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
          (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
 // q8x4x2 src1 tensor is already in VTCM spad
-static void matvec_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx, uint32_t nth, uint32_t ith) {
+static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
 
     const uint32_t src0_nrows = ne01;
@@ -1311,7 +1798,7 @@ static void matvec_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx
     // Process src0 rows
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-        mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_stride, src1_col);
+        mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
 
         // Prefetch next (n + spad_nrows) row
         const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -1329,14 +1816,14 @@ static void matvec_2d(struct htp_matmul_type * mt, struct htp_ops_context * octx
         dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                        src0_stride, src0_row_size, 1);
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-        mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+        mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
     }
 
     hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
+    FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth,
          src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
          src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
          (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
@@ -1350,7 +1837,7 @@ struct mmid_row_mapping {
 };
 
 // src1 tensor is already in VTCM spad
-static void matmul_id(struct htp_matmul_type * mt, struct htp_ops_context * octx, uint32_t nth, uint32_t ith) {
+static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
 
     struct htp_tensor * restrict     ids = &octx->src2;
@@ -1423,11 +1910,10 @@ static void matmul_id(struct htp_matmul_type * mt, struct htp_ops_context * octx
                 const int               rm2         = row_mapping.i2;  // token idx
 
                 const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1;        // src1 row idx
-                const uint8_t * restrict src1_col =
-                    (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
+                const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
                 float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0));
 
-                mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
+                mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
             }
 
             // Prefetch next (n + spad_nrows) row
@@ -1453,25 +1939,24 @@ static void matmul_id(struct htp_matmul_type * mt, struct htp_ops_context * octx
                 const int               rm2         = row_mapping.i2;  // token idx
 
                 const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1;        // src1 row idx
-                const uint8_t * restrict src1_col =
-                    (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
+                const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
                 float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0));
 
-                mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
+                mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
             }
         }
     }
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
+    FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type,
          ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
          src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1],
          dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
 // src1 tensor is already in VTCM spad
-static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx, uint32_t nth, uint32_t ith) {
+static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
 
     struct htp_tensor * restrict     ids = &octx->src2;
@@ -1531,7 +2016,7 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-            mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
+            mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
 
             // Prefetch next (n + spad_nrows) row
             const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -1549,13 +2034,13 @@ static void matvec_id(struct htp_matmul_type * mt, struct htp_ops_context * octx
             dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 1);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
-            mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
+            mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
         }
     }
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
+    FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type,
          ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
          src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0],
          dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
@@ -1754,12 +2239,14 @@ static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, ui
     hvx_copy_f16_ua(y_d, t_d, nb * 8);
 }
 
-static void quantize_f32_q8x4x2(const struct htp_tensor * src,
-                                 uint8_t * restrict dst,
-                                 struct htp_spad * spad,
-                                 uint32_t          nth,
-                                 uint32_t          ith,
-                                 uint32_t          nrows_per_thread) {
+static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_matmul_context * mmctx = data;
+    struct htp_ops_context * octx = mmctx->octx;
+
+    const struct htp_tensor * src = &octx->src1;
+    uint8_t * restrict dst = octx->src1_spad.data;
+    struct htp_spad * spad = &octx->src0_spad;
+    uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
 
     uint64_t t1 = HAP_perf_get_qtimer_count();
 
@@ -1799,8 +2286,14 @@ static void quantize_f32_q8x4x2(const struct htp_tensor * src,
          ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
-static void quantize_f32_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith,
-                              uint32_t nrows_per_thread, uint32_t dst_stride) {
+static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_matmul_context * mmctx = data;
+    struct htp_ops_context * octx = mmctx->octx;
+
+    const struct htp_tensor * src = &octx->src1;
+    uint8_t * restrict dst = octx->src1_spad.data;
+    uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
+    uint32_t dst_stride = octx->src1_spad.stride;
 
     uint64_t t1 = HAP_perf_get_qtimer_count();
 
@@ -1835,8 +2328,14 @@ static void quantize_f32_f16(const struct htp_tensor * src, uint8_t * restrict d
 }
 
 // TODO just a plain copy that should be done via the DMA during the Op setup
-static void quantize_f16_f16(const struct htp_tensor * src, uint8_t * restrict dst, uint32_t nth, uint32_t ith,
-                              uint32_t nrows_per_thread, uint32_t dst_stride) {
+static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_matmul_context * mmctx = data;
+    struct htp_ops_context * octx = mmctx->octx;
+
+    const struct htp_tensor * src = &octx->src1;
+    uint8_t * restrict dst = octx->src1_spad.data;
+    uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
+    uint32_t dst_stride = octx->src1_spad.stride;
 
     uint64_t t1 = HAP_perf_get_qtimer_count();
 
@@ -1870,213 +2369,76 @@ static void quantize_f16_f16(const struct htp_tensor * src, uint8_t * restrict d
         ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
-static void htp_quantize_f32_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-    quantize_f32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread);
-}
-
-static void htp_quantize_f32_f16(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-    quantize_f32_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride);
-}
-
-static void htp_quantize_f16_f16(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-    quantize_f16_f16(&octx->src1, octx->src1_spad.data, n, i, octx->src1_nrows_per_thread, octx->src1_spad.stride);
-}
-
-// ** matmul/matvec callbacks for worker_pool
-
-static void htp_matvec_2d_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2;
-
-    matvec_2d(&mt, octx, n, i);
-}
-
-static void htp_matmul_2d_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2;
-
-    matmul_2d(&mt, octx, n, i);
-}
-
-static void htp_matvec_2d_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q8x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q8x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2;
-
-    matvec_2d(&mt, octx, n, i);
-}
-
-static void htp_matmul_2d_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q8x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q8x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2;
-
-    matmul_2d(&mt, octx, n, i);
-}
-
-static void htp_matvec_2d_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "mxfp4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_mxfp4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2;
-
-    matvec_2d(&mt, octx, n, i);
-}
-
-static void htp_matmul_2d_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "mxfp4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_mxfp4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2;
-
-    matmul_2d(&mt, octx, n, i);
-}
-
-static void htp_matvec_2d_f16_f16(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "f16-f16";
-    mt.vec_dot     = vec_dot_f16_f16_aa;
-    mt.vec_dot_rx2 = vec_dot_f16_f16_aa_rx2;
-
-    matvec_2d(&mt, octx, n, i);
-}
-
-static void htp_matmul_2d_f16_f16(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "f16-f16";
-    mt.vec_dot     = vec_dot_f16_f16_aa;
-    mt.vec_dot_rx2 = vec_dot_f16_f16_aa_rx2;
-
-    matmul_2d(&mt, octx, n, i);
-}
-
-static void htp_matmul_4d_f16_f32(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "f16-f32";
-    mt.vec_dot     = vec_dot_f16_f32_uu;
-
-    matmul_4d(&mt, octx, n, i);
-}
-
-static void htp_matmul_4d_f16_f16(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "f16-f16";
-    mt.vec_dot     = vec_dot_f16_f16_uu;
-
-    matmul_4d(&mt, octx, n, i);
-}
-
-// ** matmul-id callbacks for worker_pool
-
-static void htp_matvec_id_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2;
-
-    matvec_id(&mt, octx, n, i);
-}
-
-static void htp_matmul_id_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2;
-
-    matmul_id(&mt, octx, n, i);
-}
-
-static void htp_matvec_id_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q8x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q8x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2;
-
-    matvec_id(&mt, octx, n, i);
-}
-
-static void htp_matmul_id_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "q8x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_q8x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2;
-
-    matmul_id(&mt, octx, n, i);
-}
-
-static void htp_matvec_id_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "mxfp4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_mxfp4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2;
-
-    matvec_id(&mt, octx, n, i);
-}
-
-static void htp_matmul_id_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) {
-    struct htp_ops_context * octx = data;
-
-    struct htp_matmul_type mt;
-    mt.type        = "mxfp4x4x2-q8x4x2";
-    mt.vec_dot     = vec_dot_mxfp4x4x2_q8x4x2;
-    mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2;
-
-    matmul_id(&mt, octx, n, i);
-}
-
-// ** main matmul entry point
 
 static inline bool htp_is_permuted(const struct htp_tensor * t) {
     return t->nb[0] > t->nb[1] || t->nb[1] > t->nb[2] || t->nb[2] > t->nb[3];
 }
 
+static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_type type) {
+    switch (type) {
+        case HTP_TYPE_Q4_0:
+            mmctx->type        = "q4x4x2-f32";
+            mmctx->vec_dot_1x1 = vec_dot_q4x4x2_q8x4x2_1x1;
+            mmctx->vec_dot_2x1 = vec_dot_q4x4x2_q8x4x2_2x1;
+            mmctx->vec_dot_2x2 = vec_dot_q4x4x2_q8x4x2_2x2;
+            return 0;
+        case HTP_TYPE_Q8_0:
+            mmctx->type        = "q8x4x2-f32";
+            mmctx->vec_dot_1x1 = vec_dot_q8x4x2_q8x4x2_1x1;
+            mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
+            mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
+            return 0;
+        case HTP_TYPE_MXFP4:
+            mmctx->type        = "mxfp4x4x2-f32";
+            mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
+            mmctx->vec_dot_2x1 = vec_dot_mxfp4x4x2_q8x4x2_2x1;
+            mmctx->vec_dot_2x2 = vec_dot_mxfp4x4x2_q8x4x2_2x2;
+            return 0;
+        default:
+            return -1;
+    }
+}
+
+static void htp_mminit_spad(struct htp_ops_context * octx,
+                                 size_t dst_row_size,
+                                 size_t src0_row_size_padded,
+                                 size_t src1_row_size,
+                                 uint32_t src1_nrows,
+                                 size_t src2_spad_size_per_thread) {
+    octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
+    octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
+    octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
+
+    if (src2_spad_size_per_thread > 0) {
+        octx->src2_spad.size_per_thread = src2_spad_size_per_thread;
+        octx->src2_spad.size            = octx->src2_spad.size_per_thread;
+    }
+
+    // src0 spad is also used in dynamic quantizer to store padded src1 rows
+    size_t src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
+    if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
+        octx->src0_spad.size_per_thread = src1_row_size_padded;
+    }
+
+    octx->src1_spad.size = octx->src1_spad.size_per_thread;
+    octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
+    octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
+}
+
 int op_matmul(struct htp_ops_context * octx) {
     htp_matmul_tensors_preamble;
 
-    const char * op_type;
+    struct htp_matmul_context mmctx_struct = {0};
+    struct htp_matmul_context * mmctx = &mmctx_struct;
+    mmctx->octx = octx;
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;
     const uint32_t src1_nrows = ne11 * ne12 * ne13;
 
+    // Compute src0_nrows_per_thread
+    mmctx->src0_nrows_per_thread  = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
+    mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even
+
     const size_t src0_row_size = nb01;
     const size_t dst_row_size  = nb1;
     size_t       src1_row_size = nb11;
@@ -2085,181 +2447,95 @@ int op_matmul(struct htp_ops_context * octx) {
     size_t       src1_row_size_padded;
 
     worker_callback_t quant_job_func;
-    worker_callback_t matmul_job_func;
+    worker_callback_t matmul_job_func = src1_nrows > 1 ? matmul_2d : matvec_2d;
 
     bool need_quant = !(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE);
 
-    switch (src0->type) {
-        case HTP_TYPE_Q4_0:
-            op_type        = "q4x4x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            if (src1_nrows > 1) {
-                matmul_job_func = htp_matmul_2d_q4x4x2_q8x4x2;
-            } else {
-                matmul_job_func = htp_matvec_2d_q4x4x2_q8x4x2;
-            }
+    if (src0->type == HTP_TYPE_F16) {
+        // Try optimized f16-f16 path first (src1 in VTCM)
+        const size_t f16_src1_row_size  = hex_round_up(ne10 * 2, 128);
+        const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256);
+        const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads;
+        const size_t f16_dst_spad_size  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads;
 
-            src1_row_size = q8x4x2_row_size(ne10);  // row size post quantization
+        const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size;
 
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
+        // Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting).
+        // It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul.
+        const bool is_batched  = (ne02 > 1) || (ne03 > 1);
+        const bool is_permuted = htp_is_permuted(&octx->src0) || htp_is_permuted(&octx->src1);
+
+        if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) {
+            // Optimized path
+            quant_job_func     = (src1->type == HTP_TYPE_F32) ? quantize_f32_f16 : quantize_f16_f16;
+            mmctx->type        = "f16-f16";
+            mmctx->vec_dot_1x1 = vec_dot_f16_f16_aa_1x1;
+            mmctx->vec_dot_2x1 = vec_dot_f16_f16_aa_2x1;
+            mmctx->vec_dot_2x2 = vec_dot_f16_f16_aa_2x2;
+
+            src1_row_size = f16_src1_row_size;  // row size post quantization
 
             octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
             octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
             octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
 
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
             octx->src1_spad.size = octx->src1_spad.size_per_thread;
             octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
             octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
-
-        case HTP_TYPE_Q8_0:
-            op_type        = "q8x4x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            if (src1_nrows > 1) {
-                matmul_job_func = htp_matmul_2d_q8x4x2_q8x4x2;
+        } else {
+            // Fallback to f16/f32 (DDR) if src1 doesn't fit in VTCM or broadcasting is required
+            quant_job_func = NULL;
+            if (src1->type == HTP_TYPE_F32) {
+                mmctx->type        = "f16-f32";
+                mmctx->vec_dot_1x1 = vec_dot_f16_f32_uu_1x1;
+                matmul_job_func    = matmul_4d;
             } else {
-                matmul_job_func = htp_matvec_2d_q8x4x2_q8x4x2;
+                mmctx->type        = "f16-f16";
+                mmctx->vec_dot_1x1 = vec_dot_f16_f16_uu_1x1;
+                matmul_job_func    = matmul_4d;
             }
 
-            src1_row_size = q8x4x2_row_size(ne10);  // row size post quantization
-
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
+            src1_row_size = nb11;  // original row size in DDR
 
             octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
+            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256);
+            octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256);
 
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
-            octx->src1_spad.size = octx->src1_spad.size_per_thread;
             octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
+            octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
             octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
 
-        case HTP_TYPE_MXFP4:
-            op_type        = "mxfp4x4x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            if (src1_nrows > 1) {
-                matmul_job_func = htp_matmul_2d_mxfp4x4x2_q8x4x2;
-            } else {
-                matmul_job_func = htp_matvec_2d_mxfp4x4x2_q8x4x2;
-            }
+            // Init fastdiv for matmul_4d (supports broadcasting)
+            mmctx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]);
+            mmctx->mm_div_ne1      = init_fastdiv_values(dst->ne[1]);
+            mmctx->mm_div_r2       = init_fastdiv_values(src1->ne[2] / src0->ne[2]);
+            mmctx->mm_div_r3       = init_fastdiv_values(src1->ne[3] / src0->ne[3]);
 
-            src1_row_size = q8x4x2_row_size(ne10);  // row size post quantization
-
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
-
-            octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
-
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
-            octx->src1_spad.size = octx->src1_spad.size_per_thread;
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
-
-        case HTP_TYPE_F16:
-            {
-                // Try optimized f16-f16 path first (src1 in VTCM)
-                const size_t f16_src1_row_size  = hex_round_up(ne10 * 2, 128);
-                const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256);
-                const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads;
-                const size_t f16_dst_spad_size  = hex_round_up(MM_SPAD_DST_NROWS  * dst_row_size, 256) * octx->n_threads;
-
-                const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size;
-
-                // Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting).
-                // It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul.
-                const bool is_batched  = (ne02 > 1) || (ne03 > 1);
-                const bool is_permuted = htp_is_permuted(&octx->src0) || htp_is_permuted(&octx->src1);
-
-                if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) {
-                    // Optimized path
-                    op_type        = "f16-f16";
-                    quant_job_func = (src1->type == HTP_TYPE_F32) ? htp_quantize_f32_f16 : htp_quantize_f16_f16;
-                    if (src1_nrows > 1) {
-                        matmul_job_func = htp_matmul_2d_f16_f16;
-                    } else {
-                        matmul_job_func = htp_matvec_2d_f16_f16;
-                    }
-
-                    src1_row_size = f16_src1_row_size; // row size post quantization
-
-                    octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-                    octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-                    octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
-
-                    octx->src1_spad.size = octx->src1_spad.size_per_thread;
-                    octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-                    octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-                } else {
-                    // Fallback to f16/f32 (DDR) if src1 doesn't fit in VTCM or broadcasting is required
-                    quant_job_func  = NULL;
-                    if (src1->type == HTP_TYPE_F32) {
-                        op_type         = "f16-f32";
-                        matmul_job_func = htp_matmul_4d_f16_f32;
-                    } else {
-                        op_type         = "f16-f16";
-                        matmul_job_func = htp_matmul_4d_f16_f16;
-                    }
-
-                    src1_row_size = nb11; // original row size in DDR
-
-                    octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-                    octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256);
-                    octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256);
-
-                    octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-                    octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
-                    octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-
-                    // Init fastdiv for matmul_4d (supports broadcasting)
-                    octx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]);
-                    octx->mm_div_ne1      = init_fastdiv_values(dst->ne[1]);
-                    octx->mm_div_r2       = init_fastdiv_values(src1->ne[2] / src0->ne[2]);
-                    octx->mm_div_r3       = init_fastdiv_values(src1->ne[3] / src0->ne[3]);
-
-                    need_quant = false;
-                }
-            }
-            break;
-
-        default:
+            need_quant = false;
+        }
+    } else {
+        if (htp_mminit_vec_dot(mmctx, src0->type) != 0) {
             return HTP_STATUS_NO_SUPPORT;
+        }
+
+        quant_job_func = quantize_f32_q8x4x2;
+        src1_row_size  = q8x4x2_row_size(ne10);
+        htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, 0);
     }
 
     // VTCM scratchpads for all tensors
     size_t spad_size = octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size;
 
-    FARF(HIGH, "matmul-%s : src0-spad-size %u src1-spad-size %u dst-spad-size %u (%zu)\n", op_type,
+    FARF(HIGH, "matmul-%s : src0-spad-size %u src1-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type,
          octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size, spad_size);
 
-    FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", op_type, src0->ne[0],
+    FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, src0->ne[0],
          src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0],
          dst->ne[1], dst->ne[2], dst->ne[3], src0->data, src1->data, dst->data);
 
     // Make sure the reserved vtcm size is sufficient
     if (octx->ctx->vtcm_size < spad_size) {
-        FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type,
+        FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type,
              octx->ctx->vtcm_size, spad_size);
         return HTP_STATUS_VTCM_TOO_SMALL;
     }
@@ -2268,40 +2544,32 @@ int op_matmul(struct htp_ops_context * octx) {
     octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
     octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
 
-    octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
-    octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1);  // round up to even
-
     octx->src0_spad.stride = src0_row_size_padded;
     octx->src1_spad.stride = src1_row_size;
 
     if (need_quant) {
-        // Run quant jobs
-        const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
-        octx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
-        worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs);
+        const uint32_t n_quant_jobs  = MIN(src1_nrows, octx->n_threads);
+        mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
+        worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
     }
 
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
-        // Run matmul jobs
         const uint32_t n_matmul_jobs = octx->n_threads;
-        worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, octx, n_matmul_jobs);
+        worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs);
     }
 
     return HTP_STATUS_OK;
 }
 
-// ** main matmul-id entry point
-
 int op_matmul_id(struct htp_ops_context * octx) {
     htp_matmul_tensors_preamble;
 
+    struct htp_matmul_context mmctx_struct = {0};
+    struct htp_matmul_context * mmctx = &mmctx_struct;
+    mmctx->octx = octx;
+
     struct htp_tensor * restrict ids = &octx->src2;
 
-    const char * op_type;
-
-    worker_callback_t quant_job_func;
-    worker_callback_t matmul_id_job_func;
-
     const size_t src0_row_size = nb01;
     const size_t dst_row_size  = nb1;
 
@@ -2310,6 +2578,13 @@ int op_matmul_id(struct htp_ops_context * octx) {
     const uint32_t src0_nrows = ne01;  // per expert
     const uint32_t src1_nrows = ne11 * ne12 * ne13;
 
+    worker_callback_t quant_job_func;
+    worker_callback_t matmul_id_job_func = src1_nrows > 1 ? matmul_id : matvec_id;
+
+    // Compute src0_nrows_per_thread
+    mmctx->src0_nrows_per_thread  = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
+    mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even
+
     size_t src1_row_size;
     size_t src1_row_size_padded;
 
@@ -2320,112 +2595,29 @@ int op_matmul_id(struct htp_ops_context * octx) {
     size_t matrix_row_counts_size = n_as * sizeof(uint32_t);
     size_t matrix_row_map_size    = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping);
 
-    switch (src0->type) {
-        case HTP_TYPE_Q4_0:
-            op_type        = "q4x2x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            src1_row_size  = q8x4x2_row_size(ne10);  // row size post quantization
-            if (src1_nrows > 1) {
-                matmul_id_job_func = htp_matmul_id_q4x4x2_q8x4x2;
-            } else {
-                matmul_id_job_func = htp_matvec_id_q4x4x2_q8x4x2;
-            }
-
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
-            octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
-            octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
-
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
-            octx->src2_spad.size = octx->src2_spad.size_per_thread;
-            octx->src1_spad.size = octx->src1_spad.size_per_thread;
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
-
-        case HTP_TYPE_Q8_0:
-            op_type        = "q8x2x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            src1_row_size  = q8x4x2_row_size(ne10);  // row size post quantization
-            if (src1_nrows > 1) {
-                matmul_id_job_func = htp_matmul_id_q8x4x2_q8x4x2;
-            } else {
-                matmul_id_job_func = htp_matvec_id_q8x4x2_q8x4x2;
-            }
-
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
-            octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
-            octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
-
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
-            octx->src2_spad.size = octx->src2_spad.size_per_thread;
-            octx->src1_spad.size = octx->src1_spad.size_per_thread;
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
-
-        case HTP_TYPE_MXFP4:
-            op_type        = "mxfp4x2x2-f32";
-            quant_job_func = htp_quantize_f32_q8x4x2;
-            src1_row_size  = q8x4x2_row_size(ne10);  // row size post quantization
-            if (src1_nrows > 1) {
-                matmul_id_job_func = htp_matmul_id_mxfp4x4x2_q8x4x2;
-            } else {
-                matmul_id_job_func = htp_matvec_id_mxfp4x4x2_q8x4x2;
-            }
-
-            // Entire src1 tensor is placed into the VTCM
-            // For other tensors we allocate N rows per thread, padded to HVX vector size
-            octx->dst_spad.size_per_thread  = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
-            octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
-            octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
-            octx->src2_spad.size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
-
-            // src0 spad is also used in dynamic quantizer to store padded src1 rows
-            src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
-            if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
-                octx->src0_spad.size_per_thread = src1_row_size_padded;
-            }
-
-            octx->src2_spad.size = octx->src2_spad.size_per_thread;
-            octx->src1_spad.size = octx->src1_spad.size_per_thread;
-            octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
-            octx->dst_spad.size  = octx->dst_spad.size_per_thread * octx->n_threads;
-            break;
-
-        default:
-            return HTP_STATUS_NO_SUPPORT;
+    if (htp_mminit_vec_dot(mmctx, src0->type) != 0) {
+        return HTP_STATUS_NO_SUPPORT;
     }
 
+    quant_job_func = quantize_f32_q8x4x2;
+    src1_row_size  = q8x4x2_row_size(ne10);
+
+    const size_t src2_spad_size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
+    htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread);
+
     size_t spad_size = octx->src2_spad.size + octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size;
 
-    FARF(HIGH, "matmul-id-%s : src0-spad-size %u src1-spad-size %u src2-spad-size %u dst-spad-size %u (%zu)\n", op_type,
+    FARF(HIGH, "matmul-id-%s : src0-spad-size %u src1-spad-size %u src2-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type,
          octx->src0_spad.size, octx->src1_spad.size, octx->src2_spad.size, octx->dst_spad.size, spad_size);
 
-    FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", op_type,
+    FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type,
          src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
          ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->data,
          src1->data, dst->data);
 
     // Make sure the reserved vtcm size is sufficient
     if (octx->ctx->vtcm_size < spad_size) {
-        FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type,
-             octx->ctx->vtcm_size, spad_size);
+        FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, octx->ctx->vtcm_size, spad_size);
         return HTP_STATUS_VTCM_TOO_SMALL;
     }
 
@@ -2434,8 +2626,8 @@ int op_matmul_id(struct htp_ops_context * octx) {
     octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size;
     octx->dst_spad.data  = octx->src2_spad.data + octx->src2_spad.size;
 
-    octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
-    octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1);  // round up to even
+    octx->src0_spad.stride = src0_row_size_padded;
+    octx->src1_spad.stride = src1_row_size;
 
     if (src1_nrows > 1) {
         // initialize matrix_row_counts and map
@@ -2447,8 +2639,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
         // group rows by src0 matrix
         for (uint32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {  // token idx
             for (uint32_t id = 0; id < n_ids; ++id) {         // expert idx
-                const uint32_t i02 =
-                    *(const uint32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
+                const uint32_t i02 = *(const uint32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
 
                 assert(i02 >= 0 && i02 < n_as);
 
@@ -2460,16 +2651,14 @@ int op_matmul_id(struct htp_ops_context * octx) {
 
     // Setup worker pool callbacks
     if (!(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE)) {
-        // Run quant jobs
         const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
-        octx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
-        worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs);
+        mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
+        worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
     }
 
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
-        // Run matmul-id jobs
         const uint32_t n_matmul_jobs = octx->n_threads;
-        worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, octx, n_matmul_jobs);
+        worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs);
     }
 
     return HTP_STATUS_OK;
diff --git a/ggml/src/ggml-metal/ggml-metal-common.cpp b/ggml/src/ggml-metal/ggml-metal-common.cpp
index 95627d3866..87e1378684 100644
--- a/ggml/src/ggml-metal/ggml-metal-common.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-common.cpp
@@ -264,15 +264,25 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
             case GGML_OP_NORM:
             case GGML_OP_RMS_NORM:
             case GGML_OP_GROUP_NORM:
+            case GGML_OP_L2_NORM:
             case GGML_OP_SUM_ROWS:
+            case GGML_OP_SSM_CONV:
+            case GGML_OP_SSM_SCAN:
+            case GGML_OP_CLAMP:
+            case GGML_OP_TRI:
+            case GGML_OP_DIAG:
             case GGML_OP_MUL:
             case GGML_OP_ADD:
             case GGML_OP_DIV:
             case GGML_OP_GLU:
             case GGML_OP_SCALE:
+            case GGML_OP_UNARY:
             case GGML_OP_GET_ROWS:
-            case GGML_OP_CPY:
             case GGML_OP_SET_ROWS:
+            case GGML_OP_SET:
+            case GGML_OP_CPY:
+            case GGML_OP_CONT:
+            case GGML_OP_REPEAT:
                 return true;
             default:
                 return ggml_op_is_empty(op);
@@ -312,7 +322,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
             h_add(mrs1, node0);
 
             // that many nodes forward to search for a concurrent node
-            constexpr int N_FORWARD = 8;
+            constexpr int N_FORWARD = 64;
 
             for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
                 if (used[i1]) {
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index 517559d12a..06f3d80459 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -328,31 +328,46 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum(ggml_metal_l
 }
 
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum_rows(ggml_metal_library_t lib, const ggml_tensor * op) {
-    GGML_ASSERT(op->src[0]->nb[0] == ggml_type_size(op->src[0]->type));
+    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
 
     char base[256];
     char name[256];
 
-    const char * op_str = "undefined";
+    int op_num = -1;
+
     switch (op->op) {
-        case GGML_OP_SUM_ROWS:
-            op_str = "sum_rows"; break;
-        case GGML_OP_MEAN:
-            op_str = "mean"; break;
+        case GGML_OP_SUM_ROWS: op_num = OP_SUM_ROWS_NUM_SUM_ROWS; break;
+        case GGML_OP_MEAN:     op_num = OP_SUM_ROWS_NUM_MEAN;     break;
         default: GGML_ABORT("fatal error");
     };
 
-    snprintf(base, 256, "kernel_%s_%s", op_str, ggml_type_name(op->src[0]->type));
+    const char * t0_str = ggml_type_name(op->src[0]->type);
+    const char * t_str  = ggml_type_name(op->type);
 
-    snprintf(name, 256, "%s", base);
+    const bool is_c4 = op->src[0]->ne[0] % 4 == 0;
+
+    snprintf(base, 256, "kernel_sum_rows_%s_%s%s", t0_str, t_str, is_c4 ? "_4" : "");
+    snprintf(name, 256, "%s_op=%d", base, op_num);
 
     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
     if (!res.pipeline) {
-        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+        ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+        ggml_metal_cv_set_int16(cv, op_num, FC_SUM_ROWS + 0);
+
+        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+        ggml_metal_cv_free(cv);
     }
 
     res.smem = 32*sizeof(float);
 
+    if (is_c4) {
+        res.smem *= 4;
+    }
+
+    res.c4  = is_c4;
+
     return res;
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 4ea0bfb94d..b4ca9c5dd6 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1159,6 +1159,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             return has_simdgroup_reduction;
+        case GGML_OP_SET:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
         case GGML_OP_CONT:
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index 952e1be076..383e0d6e93 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -82,6 +82,7 @@
 #define FC_COUNT_EQUAL                 1100
 #define FC_UNARY                       1200
 #define FC_BIN                         1300
+#define FC_SUM_ROWS                    1400
 
 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPSG 8
@@ -118,6 +119,8 @@
 #define OP_UNARY_NUM_SOFTPLUS    115
 #define OP_UNARY_NUM_EXPM1       116
 
+#define OP_SUM_ROWS_NUM_SUM_ROWS 10
+#define OP_SUM_ROWS_NUM_MEAN     11
 
 // kernel argument structs
 //
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 7db95d1c84..c04e9fc7ff 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -426,6 +426,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
             } break;
+        case GGML_OP_SET:
+            {
+                n_fuse = ggml_metal_op_set(ctx, idx);
+            } break;
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
@@ -904,6 +908,11 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
     GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
 
+    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
+
+    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
+
     ggml_metal_kargs_sum_rows args = {
         /*.ne00 =*/ ne00,
         /*.ne01 =*/ ne01,
@@ -925,21 +934,26 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
 
     auto pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
 
+    if (pipeline.c4) {
+        args.ne00 = ne00/4;
+        args.ne0  = ne0/4;
+    }
+
     int nth = 32; // SIMD width
 
-    while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+    while (nth < args.ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
         nth *= 2;
     }
 
     nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
-    nth = std::min(nth, ne00);
+    nth = std::min(nth, (int) args.ne00);
 
     const size_t smem = pipeline.smem;
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+    ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+    ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
 
     ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
 
@@ -1599,6 +1613,134 @@ int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
     return 1;
 }
 
+int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+    ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
+    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
+
+    const size_t pnb1 = ((const int32_t *) op->op_params)[0];
+    const size_t pnb2 = ((const int32_t *) op->op_params)[1];
+    const size_t pnb3 = ((const int32_t *) op->op_params)[2];
+    const size_t offs = ((const int32_t *) op->op_params)[3];
+
+    const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
+
+    if (!inplace) {
+        // run a separete kernel to cpy src->dst
+        // not sure how to avoid this
+        // TODO: make a simpler cpy_bytes kernel
+
+        //const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
+        auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+
+        ggml_metal_kargs_cpy args = {
+            /*.nk0  =*/ ne00,
+            /*.ne00 =*/ ne00,
+            /*.ne01 =*/ ne01,
+            /*.ne02 =*/ ne02,
+            /*.ne03 =*/ ne03,
+            /*.nb00 =*/ nb00,
+            /*.nb01 =*/ nb01,
+            /*.nb02 =*/ nb02,
+            /*.nb03 =*/ nb03,
+            /*.ne0  =*/ ne0,
+            /*.ne1  =*/ ne1,
+            /*.ne2  =*/ ne2,
+            /*.ne3  =*/ ne3,
+            /*.nb0  =*/ nb0,
+            /*.nb1  =*/ nb1,
+            /*.nb2  =*/ nb2,
+            /*.nb3  =*/ nb3,
+        };
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+
+        const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+        ggml_metal_op_concurrency_reset(ctx);
+    }
+
+    auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[1]->type, op->type);
+
+    GGML_ASSERT(ne10 % ggml_blck_size(op->src[1]->type) == 0);
+
+    int64_t nk0 = ne10;
+    if (ggml_is_quantized(op->src[1]->type)) {
+        nk0 = ne10/16;
+    } else if (ggml_is_quantized(op->type)) {
+        nk0 = ne10/ggml_blck_size(op->type);
+    }
+
+    int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+    // when rows are small, we can batch them together in a single threadgroup
+    int nrptg = 1;
+
+    // TODO: relax this constraint in the future
+    if (ggml_blck_size(op->src[1]->type) == 1 && ggml_blck_size(op->type) == 1) {
+        if (nth > nk0) {
+            nrptg = (nth + nk0 - 1)/nk0;
+            nth   = nk0;
+
+            if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+                nrptg--;
+            }
+        }
+    }
+
+    nth = std::min<int>(nth, nk0);
+
+    ggml_metal_kargs_cpy args = {
+        /*.nk0  =*/ nk0,
+        /*.ne00 =*/ ne10,
+        /*.ne01 =*/ ne11,
+        /*.ne02 =*/ ne12,
+        /*.ne03 =*/ ne13,
+        /*.nb00 =*/ nb10,
+        /*.nb01 =*/ nb11,
+        /*.nb02 =*/ nb12,
+        /*.nb03 =*/ nb13,
+        /*.ne0  =*/ ne10,
+        /*.ne1  =*/ ne11,
+        /*.ne2  =*/ ne12,
+        /*.ne3  =*/ ne13,
+        /*.nb0  =*/ ggml_element_size(op),
+        /*.nb1  =*/ pnb1,
+        /*.nb2  =*/ pnb2,
+        /*.nb3  =*/ pnb3,
+    };
+
+    const int nw0 = nrptg == 1 ? (nk0 + nth - 1)/nth : 1;
+
+    bid_dst.offs += offs;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, bid_src1, 1);
+    ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, nw0*(ne11 + nrptg - 1)/nrptg, ne12, ne13, nth, nrptg, 1);
+
+    return 1;
+}
+
 int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
     ggml_tensor * op = ctx->node(idx);
 
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
index 29456d70d5..f3e38c7aa9 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -59,6 +59,7 @@ int ggml_metal_op_ssm_conv          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_ssm_scan          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_rwkv              (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_solve_tri         (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_set               (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_cpy               (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_pool_1d           (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_pool_2d           (ggml_metal_op_t ctx, int idx);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 0036ba90ec..6c349aa0c9 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -77,6 +77,14 @@ static inline float dot(float x, float y) {
     return x*y;
 }
 
+static inline float sum(float x) {
+    return x;
+}
+
+static inline float sum(float4 x) {
+    return x[0] + x[1] + x[2] + x[3];
+}
+
 // NOTE: this is not dequantizing - we are simply fitting the template
 template <typename type4x4>
 void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
@@ -1501,33 +1509,35 @@ kernel void kernel_op_sum_f32(
     }
 }
 
-template <bool norm>
-kernel void kernel_sum_rows(
+constant short FC_sum_rows_op [[function_constant(FC_SUM_ROWS + 0)]];
+
+template <typename T0, typename T>
+kernel void kernel_sum_rows_impl(
         constant ggml_metal_kargs_sum_rows & args,
-        device const float * src0,
-        device       float * dst,
-        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        device const char * src0,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
         uint3   tgpig[[threadgroup_position_in_grid]],
         ushort3 tpitg[[thread_position_in_threadgroup]],
         ushort  sgitg[[simdgroup_index_in_threadgroup]],
         ushort  tiisg[[thread_index_in_simdgroup]],
         ushort3   ntg[[threads_per_threadgroup]]) {
-    int64_t i3 = tgpig.z;
-    int64_t i2 = tgpig.y;
-    int64_t i1 = tgpig.x;
+#define FC_OP  FC_sum_rows_op
 
-    if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
-        return;
-    }
+    const int i3 = tgpig.z;
+    const int i2 = tgpig.y;
+    const int i1 = tgpig.x;
+
+    threadgroup T0 * shmem_t = (threadgroup T0 *) shmem;
 
     if (sgitg == 0) {
-        shmem_f32[tiisg] = 0.0f;
+        shmem_t[tiisg] = 0.0f;
     }
 
-    device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
-    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);
+    device const T0 * src_row = (device const T0 *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
+    device       T  * dst_row = (device       T  *) (dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);
 
-    float sumf = 0;
+    T0 sumf = T0(0.0f);
 
     for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
         sumf += src_row[i0];
@@ -1538,23 +1548,33 @@ kernel void kernel_sum_rows(
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     if (tiisg == 0) {
-        shmem_f32[sgitg] = sumf;
+        shmem_t[sgitg] = sumf;
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
-    sumf = shmem_f32[tiisg];
+    sumf = shmem_t[tiisg];
     sumf = simd_sum(sumf);
 
     if (tpitg.x == 0) {
-        dst_row[0] = norm ? sumf / args.ne00 : sumf;
+        if (FC_OP == OP_SUM_ROWS_NUM_MEAN) {
+            if (is_same<float4, T0>::value) {
+                dst_row[0] = sum(sumf) / (4*args.ne00);
+            } else {
+                dst_row[0] = sum(sumf) / args.ne00;
+            }
+        } else {
+            dst_row[0] = sum(sumf);
+        }
     }
+
+#undef FC_OP
 }
 
-typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
+typedef decltype(kernel_sum_rows_impl<float, float>) kernel_sum_rows_t;
 
-template [[host_name("kernel_sum_rows_f32")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
-template [[host_name("kernel_mean_f32")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
+template [[host_name("kernel_sum_rows_f32_f32")]]   kernel kernel_sum_rows_t kernel_sum_rows_impl<float,  float>;
+template [[host_name("kernel_sum_rows_f32_f32_4")]] kernel kernel_sum_rows_t kernel_sum_rows_impl<float4, float>;
 
 template<typename T>
 kernel void kernel_cumsum_blk(
@@ -2435,9 +2455,6 @@ kernel void kernel_solve_tri_f32(
     const short K   = FC_solve_tri_k;
     const short NP  = PAD2(N, NW);
 
-    const int32_t ne02 = args.ne02;
-    const int32_t ne03 = args.ne03;
-
     const int32_t i03 = tgpig.z;
     const int32_t i02 = tgpig.y;
     const int32_t i01 = tgpig.x*NSG + sgitg;
@@ -5949,7 +5966,7 @@ kernel void kernel_flash_attn_ext_vec(
     static_assert(DK4 % NL == 0, "DK4 must be divisible by NL");
     static_assert(DV4 % NL == 0, "DV4 must be divisible by NL");
 
-    const short T = PK + NSG*SH; // shared memory size per query in (half)
+  //const short T = PK + NSG*SH; // shared memory size per query in (half)
 
   //threadgroup q_t   * sq  = (threadgroup q_t   *) (shmem_f16 +                      0*PK); // holds the query data
     threadgroup q4_t  * sq4 = (threadgroup q4_t  *) (shmem_f16 +                      0*PK); // same as above but in q4_t
@@ -8537,7 +8554,9 @@ kernel void kernel_mul_mm(
     threadgroup S0 * sa = (threadgroup S0 *)(shmem);
     threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
 
+#ifdef GGML_METAL_HAS_TENSOR
     threadgroup float * sc = (threadgroup float *)(shmem);
+#endif
 
     constexpr int NR0 = 64;
     constexpr int NR1 = 32;
@@ -8660,8 +8679,8 @@ kernel void kernel_mul_mm(
             const short sx = (tiitg%NL1);
             const short sy = (tiitg/NL1)/8;
 
-            const short dx = sx;
-            const short dy = sy;
+          //const short dx = sx;
+          //const short dy = sy;
 
             const short ly = (tiitg/NL1)%8;
 
@@ -8910,7 +8929,9 @@ kernel void kernel_mul_mm_id(
     threadgroup S0 * sa = (threadgroup S0 *)(shmem);
     threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
 
+#ifdef GGML_METAL_HAS_TENSOR
     threadgroup float * sc = (threadgroup float *)(shmem);
+#endif
 
     constexpr int NR0 = 64;
     constexpr int NR1 = 32;
@@ -9045,8 +9066,8 @@ kernel void kernel_mul_mm_id(
             const short sx = (tiitg%NL1);
             const short sy = (tiitg/NL1)/8;
 
-            const short dx = sx;
-            const short dy = sy;
+          //const short dx = sx;
+          //const short dy = sy;
 
             const short ly = (tiitg/NL1)%8;
 
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index b6094fb68b..f389193691 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -85,6 +85,8 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_8x_flat
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
+    mul_mv_q4_1_f32
+    mul_mv_q4_1_f32_flat
     mul_mv_q4_k_f32
     mul_mv_q6_k_f32
     mul_mv_q6_k_f32_flat
@@ -101,6 +103,8 @@ set(GGML_OPENCL_KERNELS
     gemv_moe_mxfp4_f32
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
+    mul_mm_q4_0_f32_l4_lm
+    mul_mm_q4_1_f32_l4_lm
     mul_mm_q8_0_f32_l4_lm
     mul_mm_q6_k_f32_l4_lm
     mul_mm_q8_0_f32_8x4
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 40474c193b..ae3f79fd0d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -525,6 +525,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mm_f16_f32_kq;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
     cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -532,6 +533,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_restore_block_q4_0_noshuffle;
     cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    cl_kernel kernel_mul_mv_q4_1_f32;
+    cl_kernel kernel_mul_mv_q4_1_f32_flat;
     cl_kernel kernel_mul_mv_q4_K_f32;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_mul_mv_q6_K_f32_flat;
@@ -564,6 +567,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
     cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
     cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
 
@@ -888,6 +893,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_1  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_1  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
@@ -1119,6 +1126,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_q4_1_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q4_1_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q4_1_f32.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mv_q4_1_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q4_1_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q4_1_f32_flat.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q4_1_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_1_f32_flat", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_q4_k_f32
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1361,6 +1402,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mm_q4_0_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q4_0_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q4_0_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_0_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mm_q4_1_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q4_1_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q4_1_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_1_f32_l4_lm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mm_q8_0_f32_l4_lm
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2923,6 +2996,59 @@ struct ggml_tensor_extra_cl_q4_0 {
     }
 };
 
+struct ggml_tensor_extra_cl_q4_1 {
+    // Quantized values.
+    cl_mem q = nullptr;
+    // Quantized values in image1d_buffer_t.
+    cl_mem q_img = nullptr;
+    // Scales.
+    cl_mem d = nullptr;
+    // Scales in image1d_buffer_t.
+    cl_mem d_img = nullptr;
+    // Min
+    cl_mem m = nullptr;
+    // Min in image1d_buffer_t.
+    cl_mem m_img = nullptr;
+    // Size of quantized values.
+    size_t size_q = 0;
+    // Size of scales.
+    size_t size_d = 0;
+    // Size of min values.
+    size_t size_m = 0;
+
+    ~ggml_tensor_extra_cl_q4_1() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        if (m != nullptr) {
+            CL_CHECK(clReleaseMemObject(m));
+            m = nullptr;
+        }
+        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
+        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
+        // So, there is no need to release them here.
+        // TODO: initialize them for non SMALL_PATH path, or remove them.
+        q_img = nullptr;
+        d_img = nullptr;
+        m_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+        size_m = 0;
+    }
+};
+
 struct ggml_tensor_extra_cl_mxfp4 {
     // Quantized values.
     cl_mem q = nullptr;
@@ -3399,8 +3525,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
                 return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
-                       op->src[0]->type == GGML_TYPE_Q4_K ||
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0  || op->src[0]->type == GGML_TYPE_Q4_1 ||
+                       op->src[0]->type == GGML_TYPE_MXFP4 ||
+                       op->src[0]->type == GGML_TYPE_Q4_K  ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             } else if (op->src[0]->type == GGML_TYPE_Q8_0) {
@@ -3629,6 +3756,21 @@ struct ggml_backend_opencl_buffer_context {
         return extra;
     }
 
+    ggml_tensor_extra_cl_q4_1 * ggml_opencl_alloc_temp_tensor_extra_q4_1() {
+        ggml_tensor_extra_cl_q4_1 * extra;
+        if (temp_tensor_extras_q4_1.empty()) {
+            extra = new ggml_tensor_extra_cl_q4_1();
+        } else {
+            extra = temp_tensor_extras_q4_1.back();
+            temp_tensor_extras_q4_1.pop_back();
+        }
+
+        temp_tensor_extras_q4_1_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
     ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
         ggml_tensor_extra_cl_mxfp4 * extra;
         if (temp_tensor_extras_mxfp4.empty()) {
@@ -3685,6 +3827,11 @@ struct ggml_backend_opencl_buffer_context {
         }
         temp_tensor_extras_q4_0_in_use.clear();
 
+        for (ggml_tensor_extra_cl_q4_1 * e : temp_tensor_extras_q4_1_in_use) {
+            temp_tensor_extras_q4_1.push_back(e);
+        }
+        temp_tensor_extras_q4_1_in_use.clear();
+
         for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
             temp_tensor_extras_mxfp4.push_back(e);
         }
@@ -3710,6 +3857,8 @@ struct ggml_backend_opencl_buffer_context {
     std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
     std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+    std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1;
+    std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1_in_use;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
     std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
     std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
@@ -4079,6 +4228,75 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         return;
 
     }
+    if (tensor->type == GGML_TYPE_Q4_1) {
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q4_1 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_1();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_m = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+        GGML_ASSERT(size_d + size_m + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        cl_buffer_region region;
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, mins, then quants.
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for mins.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_m;
+        extra->m = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_m, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_1;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
+        return;
+    }
     if (tensor->type == GGML_TYPE_MXFP4) {
         ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
         GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
@@ -4581,7 +4799,35 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
             size, data, 0, NULL, NULL));
         CL_CHECK(clReleaseMemObject(data_device));
         return;
-    } else if (tensor->type == GGML_TYPE_MXFP4) {
+    }
+    if (tensor->type == GGML_TYPE_Q4_1) {
+        ggml_tensor_extra_cl_q4_1 * extra = (ggml_tensor_extra_cl_q4_1 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q4_1;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->m));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+    if (tensor->type == GGML_TYPE_MXFP4) {
         ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
 
         cl_int err;
@@ -8409,6 +8655,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
 #ifdef GGML_OPENCL_SOA_Q
     ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+    ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
     ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
     ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
     ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
@@ -8922,6 +9169,91 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
                 return;
             }
+            case GGML_TYPE_Q4_0: {
+                if (ne11 < 32) {
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q4_0_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
+            case GGML_TYPE_Q4_1: {
+                if (ne11 < 32) {
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q4_1_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01;
+                int batch_stride_b = ne10*ne11;
+                int batch_stride_d = ne0*ne1;
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_1->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_1->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q4_1->m));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_a
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
             case GGML_TYPE_Q8_0: {
                 if (ne11 < 32) {
                     break;
@@ -9262,7 +9594,71 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
 #endif // GGML_OPENCL_SOA_Q
             break;
-        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_1: {
+#ifdef GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            kernel = backend_ctx->kernel_mul_mv_q4_1_f32_flat;
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_1->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_1->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q4_1->m));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &r3));
+#else
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            kernel = backend_ctx->kernel_mul_mv_q4_1_f32;
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+#endif // GGML_OPENCL_SOA_Q
+            break;
+        }
         case GGML_TYPE_Q8_0: {
 #ifdef GGML_OPENCL_SOA_Q
             kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl
index 9fb434713d..2c244ce321 100644
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -46,6 +46,15 @@ struct block_q4_0
     uint8_t qs[QK4_0 / 2];
 };
 
+//------------------------------------------------------------------------------
+// block_q4_1
+//------------------------------------------------------------------------------
+struct block_q4_1 {
+    half d; // delta
+    half m; // min
+    uchar qs[QK4_1 / 2]; // nibbles / quants
+};
+
 //------------------------------------------------------------------------------
 // block_q6_K
 //------------------------------------------------------------------------------
@@ -148,6 +157,48 @@ kernel void kernel_restore_block_q4_0_noshuffle(
     }
 }
 
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_1
+// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q4_1(
+    global struct block_q4_1 * src0,
+    global uchar * dst_q,
+    global half  * dst_d,
+    global half  * dst_m
+) {
+    global struct block_q4_1 * b = (global struct block_q4_1 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK4_1/2*get_global_id(0);
+    global half  * d = (global half *) dst_d + get_global_id(0);
+    global half  * m = (global half *) dst_m + get_global_id(0);
+
+    *d = b->d;
+    *m = b->m;
+
+    for (int i = 0; i < QK4_1/2; ++i) {
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q4_1(
+    global uchar * src_q,
+    global half  * src_d,
+    global half  * src_m,
+    global struct block_q4_1 * dst
+) {
+    global struct block_q4_1 * b = (global struct block_q4_1 *) dst + get_global_id(0);
+    global uchar * q = (global uchar *) src_q + QK4_1/2*get_global_id(0);
+    global half  * d = (global half *) src_d + get_global_id(0);
+    global half  * m = (global half *) src_m + get_global_id(0);
+
+    b->d = *d;
+    b->m = *m;
+    for (int i = 0; i < QK4_1/2; ++i) {
+        b->qs[i] = q[i];
+    }
+}
+
 //------------------------------------------------------------------------------
 // block_mxfp4
 //------------------------------------------------------------------------------
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl
new file mode 100644
index 0000000000..4100e3080a
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl
@@ -0,0 +1,163 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define LOAD_VEC_A 8
+#define LOAD_VEC_B 4
+
+#define BM 64
+#define BN 64
+#define BK 32
+#define TM 4
+#define TN 8
+
+kernel void kernel_mul_mm_q4_0_f32_l4_lm(
+    global uchar4 * src0_q,
+    global half   * src0_d,
+    global float4 * src1,
+    ulong offset1,
+    global float  * dst,
+    ulong offsetd,
+
+    int ne00,
+    int ne01,
+    int ne02,
+    int ne11,
+    int ne12,
+
+    int stride_a,
+    int stride_b,
+    int stride_d,
+
+    int batch_stride_a,
+    int batch_stride_b,
+    int batch_stride_d,
+
+    int r2,
+    int r3
+) {
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    local float buf_a[BM * BK];
+    local float buf_b[BN * BK];
+
+    const int batch_idx = get_global_id(2);
+
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
+
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
+
+    const int batch_idx_a = i03 * ne02 + i02;
+
+    const int ir = get_group_id(0);
+    const int ic = get_group_id(1);
+
+    const int tid = get_local_id(0);
+    const int th_r  = tid % (BM / TM);
+    const int th_c  = tid / (BM / TM);
+
+    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
+    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
+    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
+    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
+
+    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
+    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
+
+    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
+    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
+
+    float sums[TM * TN];
+    float cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (ir*BM + loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib  = idx / 4;
+                int iqs = idx % 4;
+
+                float d = (float)src0_d[ib];
+                global uchar4 * qs = src0_q + ib*4 + iqs;
+                uchar4 q = *qs;
+                float4 v1 = (convert_float4((uchar4)((q.s0   )&0x0F, (q.s1   )&0x0F, (q.s2   )&0x0F, (q.s3   )&0x0F)) - 8.0f)*d;
+                float4 v2 = (convert_float4((uchar4)((q.s0>>4)&0x0F, (q.s1>>4)&0x0F, (q.s2>>4)&0x0F, (q.s3>>4)&0x0F)) - 8.0f)*d;
+
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = v1.s0;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = v1.s1;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = v1.s2;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = v1.s3;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
+            } else {
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (ic*BN + loadc_b + l < ne11) {
+                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        pos_a += BK / LOAD_VEC_A;
+        pos_b += BK / LOAD_VEC_B;
+
+        for (int i = 0; i < BK; i++) {
+            for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
+            }
+
+            for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
+            }
+
+            for (int cc = 0; cc < TN; cc++) {
+                for (int cr = 0; cr < TM; cr++) {
+                    const int sums_idx = cc*TM + cr;
+                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    const int dr = ir * BM + th_r * TM;
+    const int dc = ic * BN + th_c * TN;
+
+    const int offsets = batch_idx * batch_stride_d;
+
+    for (int cc = 0; cc < TN; cc++) {
+        for (int cr = 0; cr < TM; cr++) {
+            if (dr + cr < ne01 && dc + cc < ne11) {
+                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl
new file mode 100644
index 0000000000..d0d2f08361
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl
@@ -0,0 +1,165 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define LOAD_VEC_A 8
+#define LOAD_VEC_B 4
+
+#define BM 64
+#define BN 64
+#define BK 32
+#define TM 4
+#define TN 8
+
+kernel void kernel_mul_mm_q4_1_f32_l4_lm(
+    global uchar4 * src0_q,
+    global half   * src0_d,
+    global half   * src0_m,
+    global float4 * src1,
+    ulong offset1,
+    global float  * dst,
+    ulong offsetd,
+
+    int ne00,
+    int ne01,
+    int ne02,
+    int ne11,
+    int ne12,
+
+    int stride_a,
+    int stride_b,
+    int stride_d,
+
+    int batch_stride_a,
+    int batch_stride_b,
+    int batch_stride_d,
+
+    int r2,
+    int r3
+) {
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    local float buf_a[BM * BK];
+    local float buf_b[BN * BK];
+
+    const int batch_idx = get_global_id(2);
+
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
+
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
+
+    const int batch_idx_a = i03 * ne02 + i02;
+
+    const int ir = get_group_id(0);
+    const int ic = get_group_id(1);
+
+    const int tid = get_local_id(0);
+    const int th_r  = tid % (BM / TM);
+    const int th_c  = tid / (BM / TM);
+
+    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
+    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
+    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
+    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
+
+    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
+    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
+
+    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
+    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
+
+    float sums[TM * TN];
+    float cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (ir*BM + loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib  = idx / 4;
+                int iqs = idx % 4;
+
+                float d = (float)src0_d[ib];
+                float m = (float)src0_m[ib];
+                global uchar4 * qs = src0_q + ib*4 + iqs;
+                uchar4 q = *qs;
+                float4 v1 = (convert_float4((uchar4)((q.s0   )&0x0F, (q.s1   )&0x0F, (q.s2   )&0x0F, (q.s3   )&0x0F)))*d + m;
+                float4 v2 = (convert_float4((uchar4)((q.s0>>4)&0x0F, (q.s1>>4)&0x0F, (q.s2>>4)&0x0F, (q.s3>>4)&0x0F)))*d + m;
+
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = v1.s0;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = v1.s1;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = v1.s2;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = v1.s3;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3;
+            } else {
+                buf_a[(loadr_a * 4 +  0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 +  3) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f;
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (ic*BN + loadc_b + l < ne11) {
+                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else {
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        pos_a += BK / LOAD_VEC_A;
+        pos_b += BK / LOAD_VEC_B;
+
+        for (int i = 0; i < BK; i++) {
+            for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
+            }
+
+            for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
+            }
+
+            for (int cc = 0; cc < TN; cc++) {
+                for (int cr = 0; cr < TM; cr++) {
+                    const int sums_idx = cc*TM + cr;
+                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    const int dr = ir * BM + th_r * TM;
+    const int dc = ic * BN + th_c * TN;
+
+    const int offsets = batch_idx * batch_stride_d;
+
+    for (int cc = 0; cc < TN; cc++) {
+        for (int cr = 0; cr < TM; cr++) {
+            if (dr + cr < ne01 && dc + cc < ne11) {
+                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl
new file mode 100644
index 0000000000..6fe828f20e
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl
@@ -0,0 +1,219 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_1                   32
+
+struct block_q4_1 {
+    half d; // delta
+    half m; // min
+    uchar qs[QK4_1 / 2]; // nibbles / quants
+};
+
+inline float block_q4_1_dot_y(
+    global const struct block_q4_1 * qb_curr,
+    float sumy,
+    float16 yl,
+    int il
+) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+    global const ushort * qs = ((global const ushort *) qb_curr + 2 + il/2);
+
+    acc.s0 += yl.s0 * (qs[0] & 0x000F);
+    acc.s0 += yl.s1 * (qs[0] & 0x0F00);
+    acc.s0 += yl.s8 * (qs[0] & 0x00F0);
+    acc.s3 += yl.s9 * (qs[0] & 0xF000);
+
+    acc.s0 += yl.s2 * (qs[1] & 0x000F);
+    acc.s1 += yl.s3 * (qs[1] & 0x0F00);
+    acc.s2 += yl.sa * (qs[1] & 0x00F0);
+    acc.s3 += yl.sb * (qs[1] & 0xF000);
+
+    acc.s0 += yl.s4 * (qs[2] & 0x000F);
+    acc.s1 += yl.s5 * (qs[2] & 0x0F00);
+    acc.s2 += yl.sc * (qs[2] & 0x00F0);
+    acc.s3 += yl.sd * (qs[2] & 0xF000);
+
+    acc.s0 += yl.s6 * (qs[3] & 0x000F);
+    acc.s1 += yl.s7 * (qs[3] & 0x0F00);
+    acc.s2 += yl.se * (qs[3] & 0x00F0);
+    acc.s3 += yl.sf * (qs[3] & 0xF000);
+
+    return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3) + sumy * m;
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each subgroup works on 4 rows
+#define N_SIMDGROUP 1 // number of subgroups in a thread group
+#define N_SIMDWIDTH 16 // assuming subgroup size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32(
+        global void * src0,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_1;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global struct block_q4_1 * x = (global struct block_q4_1 *) src0 + offset0;
+    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix * QK4_1 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q4_1_dot_y(x+ib+0*nb, sumy, yl, il);
+        sumf.s1 += block_q4_1_dot_y(x+ib+1*nb, sumy, yl, il);
+        sumf.s2 += block_q4_1_dot_y(x+ib+2*nb, sumy, yl, il);
+        sumf.s3 += block_q4_1_dot_y(x+ib+3*nb, sumy, yl, il);
+
+        yb += QK4_1 * (N_SIMDWIDTH/2);
+    }
+
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q4_1_f32(
+        global void * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl
new file mode 100644
index 0000000000..d7c4645d67
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl
@@ -0,0 +1,229 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_1                   32
+
+struct block_q4_1 {
+    half d; // delta
+    half m; // min
+    uchar qs[QK4_1 / 2]; // nibbles / quants
+};
+
+inline float block_q4_1_dot_y_flat(
+    global const uchar * x,
+    global const half  * dh,
+    global const half  * mh,
+    float sumy,
+    float16 yl,
+    int il
+) {
+    float                 d   = *dh;
+    float                 m   = *mh;
+    global const ushort * qs = ((global const ushort *) x + il/2);
+
+    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+
+    acc.s0 += yl.s0 * (qs[0] & 0x000F);
+    acc.s0 += yl.s1 * (qs[0] & 0x0F00);
+    acc.s0 += yl.s8 * (qs[0] & 0x00F0);
+    acc.s3 += yl.s9 * (qs[0] & 0xF000);
+
+    acc.s0 += yl.s2 * (qs[1] & 0x000F);
+    acc.s1 += yl.s3 * (qs[1] & 0x0F00);
+    acc.s2 += yl.sa * (qs[1] & 0x00F0);
+    acc.s3 += yl.sb * (qs[1] & 0xF000);
+
+    acc.s0 += yl.s4 * (qs[2] & 0x000F);
+    acc.s1 += yl.s5 * (qs[2] & 0x0F00);
+    acc.s2 += yl.sc * (qs[2] & 0x00F0);
+    acc.s3 += yl.sd * (qs[2] & 0xF000);
+
+    acc.s0 += yl.s6 * (qs[3] & 0x000F);
+    acc.s1 += yl.s7 * (qs[3] & 0x0F00);
+    acc.s2 += yl.se * (qs[3] & 0x00F0);
+    acc.s3 += yl.sf * (qs[3] & 0xF000);
+
+    return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3) + sumy * m;
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each subgroup works on 4 rows
+#define N_SIMDGROUP 1 // number of subgroups in a thread group
+#define N_SIMDWIDTH 16 // assuming subgroup size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_flat(
+        global void * src0_q,
+        global void * src0_d,
+        global void * src0_m,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_1;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    // The number of scales/mins is the same as the number of blocks.
+    ulong offset0_dm = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02));
+    // Each block contains QK4_1/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q  = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_1/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_dm;
+    global half  * m = (global half  *) src0_m + offset0_dm;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix * QK4_1 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 0*nb*QK4_1/2, d + ib + 0*nb, m + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 1*nb*QK4_1/2, d + ib + 1*nb, m + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 2*nb*QK4_1/2, d + ib + 2*nb, m + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q4_1_dot_y_flat(x + ib*QK4_1/2 + 3*nb*QK4_1/2, d + ib + 3*nb, m + ib + 3*nb, sumy, yl, il);
+
+        yb += QK4_1 * (N_SIMDWIDTH/2);
+    }
+
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q4_1_f32_flat(
+        global void * src0_q,
+        global void * src0_d,
+        global void * src0_m,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_flat(src0_q, src0_d, src0_m, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
diff --git a/include/llama.h b/include/llama.h
index 46c3672e98..305623127c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1150,9 +1150,9 @@ extern "C" {
     //
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
-    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    ///
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+    /// @param tmpl A Jinja template to use for this chat.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
     /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
diff --git a/scripts/pr2wt.sh b/scripts/pr2wt.sh
index bd635f3b9d..067f5d466b 100755
--- a/scripts/pr2wt.sh
+++ b/scripts/pr2wt.sh
@@ -30,12 +30,18 @@ fi
 PR=$1
 [[ "$PR" =~ ^[0-9]+$ ]] || { echo "error: PR number must be numeric"; exit 1; }
 
+url_origin=$(git config --get remote.upstream.url 2>/dev/null) || \
 url_origin=$(git config --get remote.origin.url) || {
-    echo "error: no remote named 'origin' in this repository"
+    echo "error: no remote named 'upstream' or 'origin' in this repository"
     exit 1
 }
 
-org_repo=$(echo $url_origin | cut -d/ -f4-)
+# Extract org/repo from either https or ssh format.
+if [[ $url_origin =~ ^git@ ]]; then
+    org_repo=$(echo $url_origin | cut -d: -f2)
+else
+    org_repo=$(echo $url_origin | cut -d/ -f4-)
+fi
 org_repo=${org_repo%.git}
 
 echo "org/repo: $org_repo"
diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py
index 1ff6a9a40f..68db04dea9 100755
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -2,6 +2,8 @@
 
 import urllib.request
 
+HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6"
+
 vendor = {
     "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
     "https://github.com/nlohmann/json/releases/latest/download/json_fwd.hpp": "vendor/nlohmann/json_fwd.hpp",
@@ -12,8 +14,8 @@ vendor = {
     # "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
     "https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
 
-    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/httplib.h": "vendor/cpp-httplib/httplib.h",
-    "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/LICENSE":   "vendor/cpp-httplib/LICENSE",
+    f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/httplib.h": "vendor/cpp-httplib/httplib.h",
+    f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/LICENSE":   "vendor/cpp-httplib/LICENSE",
 
     "https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
 }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5816e9a954..6b7da69e9d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7965,7 +7965,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             cparams.n_seq_max,
                             nullptr);
                 } else if (llm_arch_is_hybrid(arch)) {
-
                     // The main difference between hybrid architectures is the
                     // layer filters, so pick the right one here
                     llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
@@ -7990,7 +7989,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
                             /* attn_swa_full     */ params.swa_full,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_ubatch     */ cparams.n_ubatch,
                             /* attn_n_pad        */ 1,
                             /* recurrent_type_r  */ GGML_TYPE_F32,
@@ -8007,7 +8006,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* attn_type_k       */ params.type_k,
                             /* attn_type_v       */ params.type_v,
                             /* attn_v_trans      */ !cparams.flash_attn,
-                            /* attn_kv_size      */ cparams.n_ctx,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
                             /* attn_n_pad        */ 1,
                             /* attn_n_swa        */ hparams.n_swa,
                             /* attn_swa_type     */ hparams.swa_type,
diff --git a/src/unicode.cpp b/src/unicode.cpp
index adfc489d1f..b88d953bd2 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -1,16 +1,10 @@
-#if defined(_MSC_VER)
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
 #include "unicode.h"
 #include "unicode-data.h"
 
 #include <algorithm>
 #include <cassert>
-#include <codecvt>
 #include <cstddef>
 #include <cstdint>
-#include <locale>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -199,27 +193,6 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
     return map;
 }
 
-static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
-#if defined(__clang__)
-    // disable C++17 deprecation warning for std::codecvt_utf8
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic pop
-#endif
-
-    return conv.from_bytes(s);
-}
-
 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
     std::vector<std::string> bpe_encoded_words;
     for (const auto & word : bpe_words) {
@@ -1028,10 +1001,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     break;
                 }
             }
+            const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
 
             if (use_collapsed) {
                 // sanity-check that the original regex does not contain any non-ASCII characters
-                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
                 for (size_t i = 0; i < cpts_regex.size(); ++i) {
                     if (cpts_regex[i] >= 128) {
                         throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
@@ -1087,7 +1060,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
             } else {
                 // no unicode category used, we can use std::wregex directly
-                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+                std::wstring wregex_expr(cpts_regex.begin(), cpts_regex.end());
 
                 // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                 std::wstring wtext(cpts.begin(), cpts.end());
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index ed99c24516..8816f6963f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2786,9 +2786,10 @@ struct test_set : public test_case {
     const ggml_type type_dst;
     const std::array<int64_t, 4> ne;
     const int dim;
+    const bool inplace;
 
     std::string vars() override {
-        return VARS_TO_STR4(type_src, type_dst, ne, dim);
+        return VARS_TO_STR5(type_src, type_dst, ne, dim, inplace);
     }
 
     size_t op_size(ggml_tensor * t) override {
@@ -2796,8 +2797,8 @@ struct test_set : public test_case {
     }
 
     test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
-        : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
+            std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1, bool inplace = false)
+        : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim), inplace(inplace) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
@@ -2808,7 +2809,7 @@ struct test_set : public test_case {
         for (int i = 0; i < dim; ++i) {
             ne_dst[i] *= 2;
         }
-        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
+        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
         ggml_set_param(dst);
         ggml_set_name(dst, "dst");
 
@@ -2816,9 +2817,16 @@ struct test_set : public test_case {
         for (int i = 0; i < dim; ++i) {
             offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
         }
-        ggml_tensor * out = ggml_set(ctx, dst, src,
-            // The backward pass requires setting a contiguous region:
-            src->nb[1], src->nb[2], src->nb[3], offset);
+        ggml_tensor * out;
+        if (inplace) {
+            out = ggml_set_inplace(ctx, dst, src,
+                    // The backward pass requires setting a contiguous region:
+                    src->nb[1], src->nb[2], src->nb[3], offset);
+        } else {
+            out = ggml_set(ctx, dst, src,
+                    // The backward pass requires setting a contiguous region:
+                    src->nb[1], src->nb[2], src->nb[3], offset);
+        }
         ggml_set_name(out, "out");
 
         return out;
@@ -7428,11 +7436,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {1, 2, 0, 3}));
 
     for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
-        test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
+        test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim, false));
+        test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim, true));
     }
 
     for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
-        test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
+        test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim, false));
+        test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim, true));
     }
 
     // same-type copy
@@ -8132,24 +8142,30 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     }
 
     test_cases.emplace_back(new test_sum());
-    test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 2, 1, 3}));  // row-contiguous but non-contiguous
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 3, 2, 1}));
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, {11, 5, 6, 3}, {0, 1, 3, 2}));
+    test_cases.emplace_back(new test_mean());
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32, 1, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32768, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }, { 1, 0, 2, 3 })); // sum dst not-contiguous
+    test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, false));
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, false, true));
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, true));
-    test_cases.emplace_back(new test_mean());
-    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 16, 5, 6, 3 }, true, false));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 16, 5, 6, 3 }, false, true));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 16, 5, 6, 3 }, true, true));
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
-    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
-    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
-    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
-    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }, { 1, 0, 2, 3 })); // sum dst not-contiguous
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
-    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
-    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {64, 64, 320, 1}));
diff --git a/tools/server/README.md b/tools/server/README.md
index d132830171..0b56ca1e27 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -19,7 +19,7 @@ Set of LLM REST APIs and a web UI to interact with llama.cpp.
  * Speculative decoding
  * Easy-to-use web UI
 
-For the ful list of features, please refer to [server's changelog](https://github.com/ggml-org/llama.cpp/issues/9291)
+For the full list of features, please refer to [server's changelog](https://github.com/ggml-org/llama.cpp/issues/9291)
 
 ## Usage
 
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index e3b06f4901..f4ff57b4c9 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/docs/flows/settings-flow.md b/tools/server/webui/docs/flows/settings-flow.md
index 578e01e6e1..474aef01b0 100644
--- a/tools/server/webui/docs/flows/settings-flow.md
+++ b/tools/server/webui/docs/flows/settings-flow.md
@@ -139,6 +139,6 @@ sequenceDiagram
 
     Note over settingsStore: UI-only (not synced):
     rect rgb(255, 240, 240)
-        Note over settingsStore: systemMessage, custom (JSON)<br/>showStatistics, enableContinueGeneration<br/>autoMicOnEmpty, disableAutoScroll<br/>apiKey, pdfAsImage, disableReasoningFormat
+        Note over settingsStore: systemMessage, custom (JSON)<br/>showStatistics, enableContinueGeneration<br/>autoMicOnEmpty, disableAutoScroll<br/>apiKey, pdfAsImage, disableReasoningParsing, showRawOutputSwitch
     end
 ```
diff --git a/tools/server/webui/src/app.css b/tools/server/webui/src/app.css
index 9705040a4d..3ab21f0cc7 100644
--- a/tools/server/webui/src/app.css
+++ b/tools/server/webui/src/app.css
@@ -14,11 +14,11 @@
 	--popover-foreground: oklch(0.145 0 0);
 	--primary: oklch(0.205 0 0);
 	--primary-foreground: oklch(0.985 0 0);
-	--secondary: oklch(0.97 0 0);
+	--secondary: oklch(0.95 0 0);
 	--secondary-foreground: oklch(0.205 0 0);
 	--muted: oklch(0.97 0 0);
 	--muted-foreground: oklch(0.556 0 0);
-	--accent: oklch(0.97 0 0);
+	--accent: oklch(0.95 0 0);
 	--accent-foreground: oklch(0.205 0 0);
 	--destructive: oklch(0.577 0.245 27.325);
 	--border: oklch(0.875 0 0);
@@ -37,7 +37,7 @@
 	--sidebar-accent-foreground: oklch(0.205 0 0);
 	--sidebar-border: oklch(0.922 0 0);
 	--sidebar-ring: oklch(0.708 0 0);
-	--code-background: oklch(0.975 0 0);
+	--code-background: oklch(0.985 0 0);
 	--code-foreground: oklch(0.145 0 0);
 	--layer-popover: 1000000;
 }
@@ -51,7 +51,7 @@
 	--popover-foreground: oklch(0.985 0 0);
 	--primary: oklch(0.922 0 0);
 	--primary-foreground: oklch(0.205 0 0);
-	--secondary: oklch(0.269 0 0);
+	--secondary: oklch(0.29 0 0);
 	--secondary-foreground: oklch(0.985 0 0);
 	--muted: oklch(0.269 0 0);
 	--muted-foreground: oklch(0.708 0 0);
@@ -116,12 +116,62 @@
 	--color-sidebar-ring: var(--sidebar-ring);
 }
 
+:root {
+	--chat-form-area-height: 8rem;
+	--chat-form-area-offset: 2rem;
+	--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
+}
+
+@media (min-width: 640px) {
+	:root {
+		--chat-form-area-height: 24rem;
+		--chat-form-area-offset: 12rem;
+	}
+}
+
 @layer base {
 	* {
 		@apply border-border outline-ring/50;
 	}
+
 	body {
 		@apply bg-background text-foreground;
+		scrollbar-width: thin;
+		scrollbar-gutter: stable;
+	}
+
+	/* Global scrollbar styling - visible only on hover */
+	* {
+		scrollbar-width: thin;
+		scrollbar-color: transparent transparent;
+		transition: scrollbar-color 0.2s ease;
+	}
+
+	*:hover {
+		scrollbar-color: hsl(var(--muted-foreground) / 0.3) transparent;
+	}
+
+	*::-webkit-scrollbar {
+		width: 6px;
+		height: 6px;
+	}
+
+	*::-webkit-scrollbar-track {
+		background: transparent;
+	}
+
+	*::-webkit-scrollbar-thumb {
+		background: transparent;
+		border-radius: 3px;
+		transition: background 0.2s ease;
+	}
+
+	*:hover::-webkit-scrollbar-thumb {
+		background: hsl(var(--muted-foreground) / 0.3);
+	}
+
+	*::-webkit-scrollbar-thumb:hover {
+		background: hsl(var(--muted-foreground) / 0.5);
 	}
 }
 
diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte
new file mode 100644
index 0000000000..4494ea880b
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte
@@ -0,0 +1,48 @@
+<script lang="ts">
+	import { Button } from '$lib/components/ui/button';
+	import * as Tooltip from '$lib/components/ui/tooltip';
+	import type { Component } from 'svelte';
+
+	interface Props {
+		icon: Component;
+		tooltip: string;
+		variant?: 'default' | 'destructive' | 'outline' | 'secondary' | 'ghost' | 'link';
+		size?: 'default' | 'sm' | 'lg' | 'icon';
+		class?: string;
+		disabled?: boolean;
+		onclick: () => void;
+		'aria-label'?: string;
+	}
+
+	let {
+		icon,
+		tooltip,
+		variant = 'ghost',
+		size = 'sm',
+		class: className = '',
+		disabled = false,
+		onclick,
+		'aria-label': ariaLabel
+	}: Props = $props();
+</script>
+
+<Tooltip.Root>
+	<Tooltip.Trigger>
+		<Button
+			{variant}
+			{size}
+			{disabled}
+			{onclick}
+			class="h-6 w-6 p-0 {className} flex"
+			aria-label={ariaLabel || tooltip}
+		>
+			{@const IconComponent = icon}
+
+			<IconComponent class="h-3 w-3" />
+		</Button>
+	</Tooltip.Trigger>
+
+	<Tooltip.Content>
+		<p>{tooltip}</p>
+	</Tooltip.Content>
+</Tooltip.Root>
diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte
new file mode 100644
index 0000000000..bf6cd4fb28
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte
@@ -0,0 +1,18 @@
+<script lang="ts">
+	import { Copy } from '@lucide/svelte';
+	import { copyToClipboard } from '$lib/utils';
+
+	interface Props {
+		ariaLabel?: string;
+		canCopy?: boolean;
+		text: string;
+	}
+
+	let { ariaLabel = 'Copy to clipboard', canCopy = true, text }: Props = $props();
+</script>
+
+<Copy
+	class="h-3 w-3 flex-shrink-0 cursor-{canCopy ? 'pointer' : 'not-allowed'}"
+	aria-label={ariaLabel}
+	onclick={() => canCopy && copyToClipboard(text)}
+/>
diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconRemove.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIconRemove.svelte
new file mode 100644
index 0000000000..1ae3d21774
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/actions/ActionIconRemove.svelte
@@ -0,0 +1,26 @@
+<script lang="ts">
+	import { X } from '@lucide/svelte';
+	import { Button } from '$lib/components/ui/button';
+
+	interface Props {
+		id: string;
+		onRemove?: (id: string) => void;
+		class?: string;
+	}
+
+	let { id, onRemove, class: className = '' }: Props = $props();
+</script>
+
+<Button
+	type="button"
+	variant="ghost"
+	size="sm"
+	class="h-6 w-6 bg-white/20 p-0 hover:bg-white/30 {className}"
+	onclick={(e: MouseEvent) => {
+		e.stopPropagation();
+		onRemove?.(id);
+	}}
+	aria-label="Remove file"
+>
+	<X class="h-3 w-3" />
+</Button>
diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte
new file mode 100644
index 0000000000..54ff0af1a0
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte
@@ -0,0 +1,46 @@
+<script lang="ts">
+	import { Eye } from '@lucide/svelte';
+	import ActionIconCopyToClipboard from '$lib/components/app/actions/ActionIconCopyToClipboard.svelte';
+	import { FileTypeText } from '$lib/enums';
+
+	interface Props {
+		code: string;
+		language: string;
+		disabled?: boolean;
+		onPreview?: (code: string, language: string) => void;
+	}
+
+	let { code, language, disabled = false, onPreview }: Props = $props();
+
+	const showPreview = $derived(language?.toLowerCase() === FileTypeText.HTML);
+
+	function handlePreview() {
+		if (disabled) return;
+		onPreview?.(code, language);
+	}
+</script>
+
+<div class="code-block-actions">
+	<div class="copy-code-btn" class:opacity-50={disabled} class:!cursor-not-allowed={disabled}>
+		<ActionIconCopyToClipboard
+			text={code}
+			canCopy={!disabled}
+			ariaLabel={disabled ? 'Code incomplete' : 'Copy code'}
+		/>
+	</div>
+
+	{#if showPreview}
+		<button
+			class="preview-code-btn"
+			class:opacity-50={disabled}
+			class:!cursor-not-allowed={disabled}
+			title={disabled ? 'Code incomplete' : 'Preview code'}
+			aria-label="Preview code"
+			aria-disabled={disabled}
+			type="button"
+			onclick={handlePreview}
+		>
+			<Eye size={16} />
+		</button>
+	{/if}
+</div>
diff --git a/tools/server/webui/src/lib/components/app/actions/index.ts b/tools/server/webui/src/lib/components/app/actions/index.ts
new file mode 100644
index 0000000000..43485c7b7e
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/actions/index.ts
@@ -0,0 +1,19 @@
+/**
+ *
+ * ACTIONS
+ *
+ * Small interactive components for user actions.
+ *
+ */
+
+/** Styled icon button for action triggers with tooltip. */
+export { default as ActionIcon } from './ActionIcon.svelte';
+
+/** Code block actions component (copy, preview). */
+export { default as ActionIconsCodeBlock } from './ActionIconsCodeBlock.svelte';
+
+/** Copy-to-clipboard icon button with click handler. */
+export { default as ActionIconCopyToClipboard } from './ActionIconCopyToClipboard.svelte';
+
+/** Remove/delete icon button with X icon. */
+export { default as ActionIconRemove } from './ActionIconRemove.svelte';
diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeChatStatistic.svelte b/tools/server/webui/src/lib/components/app/badges/BadgeChatStatistic.svelte
new file mode 100644
index 0000000000..a2b28d2057
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/badges/BadgeChatStatistic.svelte
@@ -0,0 +1,44 @@
+<script lang="ts">
+	import { BadgeInfo } from '$lib/components/app';
+	import * as Tooltip from '$lib/components/ui/tooltip';
+	import { copyToClipboard } from '$lib/utils';
+	import type { Component } from 'svelte';
+
+	interface Props {
+		class?: string;
+		icon: Component;
+		value: string | number;
+		tooltipLabel?: string;
+	}
+
+	let { class: className = '', icon: Icon, value, tooltipLabel }: Props = $props();
+
+	function handleClick() {
+		void copyToClipboard(String(value));
+	}
+</script>
+
+{#if tooltipLabel}
+	<Tooltip.Root>
+		<Tooltip.Trigger>
+			<BadgeInfo class={className} onclick={handleClick}>
+				{#snippet icon()}
+					<Icon class="h-3 w-3" />
+				{/snippet}
+
+				{value}
+			</BadgeInfo>
+		</Tooltip.Trigger>
+		<Tooltip.Content>
+			<p>{tooltipLabel}</p>
+		</Tooltip.Content>
+	</Tooltip.Root>
+{:else}
+	<BadgeInfo class={className} onclick={handleClick}>
+		{#snippet icon()}
+			<Icon class="h-3 w-3" />
+		{/snippet}
+
+		{value}
+	</BadgeInfo>
+{/if}
diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte b/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte
new file mode 100644
index 0000000000..c70af6f423
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte
@@ -0,0 +1,27 @@
+<script lang="ts">
+	import { cn } from '$lib/components/ui/utils';
+	import type { Snippet } from 'svelte';
+
+	interface Props {
+		children: Snippet;
+		class?: string;
+		icon?: Snippet;
+		onclick?: () => void;
+	}
+
+	let { children, class: className = '', icon, onclick }: Props = $props();
+</script>
+
+<button
+	class={cn(
+		'inline-flex cursor-pointer items-center gap-1 rounded-sm bg-muted-foreground/15 px-1.5 py-0.75',
+		className
+	)}
+	{onclick}
+>
+	{#if icon}
+		{@render icon()}
+	{/if}
+
+	{@render children()}
+</button>
diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte
new file mode 100644
index 0000000000..a0d5e863c2
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte
@@ -0,0 +1,39 @@
+<script lang="ts">
+	import { ModelModality } from '$lib/enums';
+	import { MODALITY_ICONS, MODALITY_LABELS } from '$lib/constants/icons';
+	import { cn } from '$lib/components/ui/utils';
+
+	type DisplayableModality = ModelModality.VISION | ModelModality.AUDIO;
+
+	interface Props {
+		modalities: ModelModality[];
+		class?: string;
+	}
+
+	let { modalities, class: className = '' }: Props = $props();
+
+	// Filter to only modalities that have icons (VISION, AUDIO)
+	const displayableModalities = $derived(
+		modalities.filter(
+			(m): m is DisplayableModality => m === ModelModality.VISION || m === ModelModality.AUDIO
+		)
+	);
+</script>
+
+{#each displayableModalities as modality, index (index)}
+	{@const IconComponent = MODALITY_ICONS[modality]}
+	{@const label = MODALITY_LABELS[modality]}
+
+	<span
+		class={cn(
+			'inline-flex items-center gap-1 rounded-md bg-muted px-2 py-1 text-xs font-medium',
+			className
+		)}
+	>
+		{#if IconComponent}
+			<IconComponent class="h-3 w-3" />
+		{/if}
+
+		{label}
+	</span>
+{/each}
diff --git a/tools/server/webui/src/lib/components/app/badges/index.ts b/tools/server/webui/src/lib/components/app/badges/index.ts
new file mode 100644
index 0000000000..860afe3084
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/badges/index.ts
@@ -0,0 +1,16 @@
+/**
+ *
+ * BADGES & INDICATORS
+ *
+ * Small visual indicators for status and metadata.
+ *
+ */
+
+/** Badge displaying chat statistics (tokens, timing). */
+export { default as BadgeChatStatistic } from './BadgeChatStatistic.svelte';
+
+/** Generic info badge with optional tooltip and click handler. */
+export { default as BadgeInfo } from './BadgeInfo.svelte';
+
+/** Badge indicating model modality (vision, audio, tools). */
+export { default as BadgeModality } from './BadgeModality.svelte';
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
index 27ab975cbd..95645295fb 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -27,11 +27,13 @@
 	interface Props {
 		class?: string;
 		disabled?: boolean;
+		initialMessage?: string;
 		isLoading?: boolean;
 		onFileRemove?: (fileId: string) => void;
 		onFileUpload?: (files: File[]) => void;
 		onSend?: (message: string, files?: ChatUploadedFile[]) => Promise<boolean>;
 		onStop?: () => void;
+		onSystemPromptAdd?: (draft: { message: string; files: ChatUploadedFile[] }) => void;
 		showHelperText?: boolean;
 		uploadedFiles?: ChatUploadedFile[];
 	}
@@ -39,11 +41,13 @@
 	let {
 		class: className,
 		disabled = false,
+		initialMessage = '',
 		isLoading = false,
 		onFileRemove,
 		onFileUpload,
 		onSend,
 		onStop,
+		onSystemPromptAdd,
 		showHelperText = true,
 		uploadedFiles = $bindable([])
 	}: Props = $props();
@@ -53,15 +57,28 @@
 	let currentConfig = $derived(config());
 	let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
 	let isRecording = $state(false);
-	let message = $state('');
+	let message = $state(initialMessage);
 	let pasteLongTextToFileLength = $derived.by(() => {
 		const n = Number(currentConfig.pasteLongTextToFileLen);
 		return Number.isNaN(n) ? Number(SETTING_CONFIG_DEFAULT.pasteLongTextToFileLen) : n;
 	});
 	let previousIsLoading = $state(isLoading);
+	let previousInitialMessage = $state(initialMessage);
 	let recordingSupported = $state(false);
 	let textareaRef: ChatFormTextarea | undefined = $state(undefined);
 
+	// Sync message when initialMessage prop changes (e.g., after draft restoration)
+	$effect(() => {
+		if (initialMessage !== previousInitialMessage) {
+			message = initialMessage;
+			previousInitialMessage = initialMessage;
+		}
+	});
+
+	function handleSystemPromptClick() {
+		onSystemPromptAdd?.({ message, files: uploadedFiles });
+	}
+
 	// Check if model is selected (in ROUTER mode)
 	let conversationModel = $derived(
 		chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
@@ -308,6 +325,7 @@
 			onFileUpload={handleFileUpload}
 			onMicClick={handleMicClick}
 			onStop={handleStop}
+			onSystemPromptClick={handleSystemPromptClick}
 		/>
 	</div>
 </form>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
index dd37268096..3545b4aebf 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte
@@ -1,5 +1,6 @@
 <script lang="ts">
 	import { Paperclip } from '@lucide/svelte';
+	import { MessageSquare } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
 	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
 	import * as Tooltip from '$lib/components/ui/tooltip';
@@ -11,6 +12,7 @@
 		hasAudioModality?: boolean;
 		hasVisionModality?: boolean;
 		onFileUpload?: () => void;
+		onSystemPromptClick?: () => void;
 	}
 
 	let {
@@ -18,7 +20,8 @@
 		disabled = false,
 		hasAudioModality = false,
 		hasVisionModality = false,
-		onFileUpload
+		onFileUpload,
+		onSystemPromptClick
 	}: Props = $props();
 
 	const fileUploadTooltipText = $derived.by(() => {
@@ -118,6 +121,23 @@
 					</Tooltip.Content>
 				{/if}
 			</Tooltip.Root>
+			<DropdownMenu.Separator />
+			<Tooltip.Root>
+				<Tooltip.Trigger class="w-full">
+					<DropdownMenu.Item
+						class="flex cursor-pointer items-center gap-2"
+						onclick={() => onSystemPromptClick?.()}
+					>
+						<MessageSquare class="h-4 w-4" />
+
+						<span>System Prompt</span>
+					</DropdownMenu.Item>
+				</Tooltip.Trigger>
+
+				<Tooltip.Content>
+					<p>Add a custom system message for this conversation</p>
+				</Tooltip.Content>
+			</Tooltip.Root>
 		</DropdownMenu.Content>
 	</DropdownMenu.Root>
 </div>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
index dde9bda2d8..c621a69e05 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
@@ -27,6 +27,7 @@
 		onFileUpload?: () => void;
 		onMicClick?: () => void;
 		onStop?: () => void;
+		onSystemPromptClick?: () => void;
 	}
 
 	let {
@@ -39,7 +40,8 @@
 		uploadedFiles = [],
 		onFileUpload,
 		onMicClick,
-		onStop
+		onStop,
+		onSystemPromptClick
 	}: Props = $props();
 
 	let currentConfig = $derived(config());
@@ -170,6 +172,7 @@
 		{hasAudioModality}
 		{hasVisionModality}
 		{onFileUpload}
+		{onSystemPromptClick}
 	/>
 
 	<ModelsSelector
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index 220276fc9e..82ef7de7c7 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
@@ -1,6 +1,15 @@
 <script lang="ts">
-	import { chatStore } from '$lib/stores/chat.svelte';
+	import { goto } from '$app/navigation';
+	import {
+		chatStore,
+		pendingEditMessageId,
+		clearPendingEditMessageId,
+		removeSystemPromptPlaceholder
+	} from '$lib/stores/chat.svelte';
+	import { conversationsStore } from '$lib/stores/conversations.svelte';
+	import { DatabaseService } from '$lib/services';
 	import { config } from '$lib/stores/settings.svelte';
+	import { SYSTEM_MESSAGE_PLACEHOLDER } from '$lib/constants/ui';
 	import { copyToClipboard, isIMEComposing, formatMessageForClipboard } from '$lib/utils';
 	import ChatMessageAssistant from './ChatMessageAssistant.svelte';
 	import ChatMessageUser from './ChatMessageUser.svelte';
@@ -92,8 +101,30 @@
 		return null;
 	});
 
-	function handleCancelEdit() {
+	// Auto-start edit mode if this message is the pending edit target
+	$effect(() => {
+		const pendingId = pendingEditMessageId();
+
+		if (pendingId && pendingId === message.id && !isEditing) {
+			handleEdit();
+			clearPendingEditMessageId();
+		}
+	});
+
+	async function handleCancelEdit() {
 		isEditing = false;
+
+		// If canceling a new system message with placeholder content, remove it without deleting children
+		if (message.role === 'system') {
+			const conversationDeleted = await removeSystemPromptPlaceholder(message.id);
+
+			if (conversationDeleted) {
+				goto('/');
+			}
+
+			return;
+		}
+
 		editedContent = message.content;
 		editedExtras = message.extra ? [...message.extra] : [];
 		editedUploadedFiles = [];
@@ -114,8 +145,17 @@
 		onCopy?.(message);
 	}
 
-	function handleConfirmDelete() {
-		onDelete?.(message);
+	async function handleConfirmDelete() {
+		if (message.role === 'system') {
+			const conversationDeleted = await removeSystemPromptPlaceholder(message.id);
+
+			if (conversationDeleted) {
+				goto('/');
+			}
+		} else {
+			onDelete?.(message);
+		}
+
 		showDeleteDialog = false;
 	}
 
@@ -126,7 +166,12 @@
 
 	function handleEdit() {
 		isEditing = true;
-		editedContent = message.content;
+		// Clear placeholder content for system messages
+		editedContent =
+			message.role === 'system' && message.content === SYSTEM_MESSAGE_PLACEHOLDER
+				? ''
+				: message.content;
+		textareaElement?.focus();
 		editedExtras = message.extra ? [...message.extra] : [];
 		editedUploadedFiles = [];
 
@@ -166,7 +211,26 @@
 	}
 
 	async function handleSaveEdit() {
-		if (message.role === 'user' || message.role === 'system') {
+		if (message.role === 'system') {
+			// System messages: update in place without branching
+			const newContent = editedContent.trim();
+
+			// If content is empty or still the placeholder, remove without deleting children
+			if (!newContent) {
+				const conversationDeleted = await removeSystemPromptPlaceholder(message.id);
+				isEditing = false;
+				if (conversationDeleted) {
+					goto('/');
+				}
+				return;
+			}
+
+			await DatabaseService.updateMessage(message.id, { content: newContent });
+			const index = conversationsStore.findMessageIndex(message.id);
+			if (index !== -1) {
+				conversationsStore.updateMessageAtIndex(index, { content: newContent });
+			}
+		} else if (message.role === 'user') {
 			const finalExtras = await getMergedExtras();
 			onEditWithBranching?.(message, editedContent.trim(), finalExtras);
 		} else {
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte
index 3cb48157d8..dbd9b98228 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte
@@ -5,6 +5,7 @@
 		ChatMessageBranchingControls,
 		DialogConfirmation
 	} from '$lib/components/app';
+	import { Switch } from '$lib/components/ui/switch';
 
 	interface Props {
 		role: 'user' | 'assistant';
@@ -26,6 +27,9 @@
 		onConfirmDelete: () => void;
 		onNavigateToSibling?: (siblingId: string) => void;
 		onShowDeleteDialogChange: (show: boolean) => void;
+		showRawOutputSwitch?: boolean;
+		rawOutputEnabled?: boolean;
+		onRawOutputToggle?: (enabled: boolean) => void;
 	}
 
 	let {
@@ -42,7 +46,10 @@
 		onRegenerate,
 		role,
 		siblingInfo = null,
-		showDeleteDialog
+		showDeleteDialog,
+		showRawOutputSwitch = false,
+		rawOutputEnabled = false,
+		onRawOutputToggle
 	}: Props = $props();
 
 	function handleConfirmDelete() {
@@ -51,9 +58,9 @@
 	}
 </script>
 
-<div class="relative {justify === 'start' ? 'mt-2' : ''} flex h-6 items-center justify-{justify}">
+<div class="relative {justify === 'start' ? 'mt-2' : ''} flex h-6 items-center justify-between">
 	<div
-		class="absolute top-0 {actionsPosition === 'left'
+		class="{actionsPosition === 'left'
 			? 'left-0'
 			: 'right-0'} flex items-center gap-2 opacity-100 transition-opacity"
 	>
@@ -81,6 +88,16 @@
 			<ActionButton icon={Trash2} tooltip="Delete" onclick={onDelete} />
 		</div>
 	</div>
+
+	{#if showRawOutputSwitch}
+		<div class="flex items-center gap-2">
+			<span class="text-xs text-muted-foreground">Show raw output</span>
+			<Switch
+				checked={rawOutputEnabled}
+				onCheckedChange={(checked) => onRawOutputToggle?.(checked)}
+			/>
+		</div>
+	{/if}
 </div>
 
 <DialogConfirmation
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
index 2b34b1c20a..1cb6b274b6 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
@@ -90,6 +90,9 @@
 
 	const processingState = useProcessingState();
 
+	// Local state for raw output toggle (per message)
+	let showRawOutput = $state(false);
+
 	let currentConfig = $derived(config());
 	let isRouter = $derived(isRouterMode());
 	let displayedModel = $derived((): string | null => {
@@ -238,7 +241,7 @@
 			</div>
 		</div>
 	{:else if message.role === 'assistant'}
-		{#if config().disableReasoningFormat}
+		{#if showRawOutput}
 			<pre class="raw-output">{messageContent || ''}</pre>
 		{:else}
 			<MarkdownContent content={messageContent || ''} />
@@ -352,6 +355,9 @@
 			{onConfirmDelete}
 			{onNavigateToSibling}
 			{onShowDeleteDialogChange}
+			showRawOutputSwitch={currentConfig.showRawOutputSwitch}
+			rawOutputEnabled={showRawOutput}
+			onRawOutputToggle={(enabled) => (showRawOutput = enabled)}
 		/>
 	{/if}
 </div>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte
index c203822f60..887df5b771 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte
@@ -116,7 +116,7 @@
 
 				<Button class="h-8 px-3" onclick={onSaveEdit} disabled={!editedContent.trim()} size="sm">
 					<Check class="mr-1 h-3 w-3" />
-					Send
+					Save
 				</Button>
 			</div>
 		</div>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
index 27439551a1..a5450e6af8 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
@@ -21,6 +21,7 @@
 		chatStore,
 		errorDialog,
 		isLoading,
+		isChatStreaming,
 		isEditing,
 		getAddFilesHandler
 	} from '$lib/stores/chat.svelte';
@@ -71,6 +72,8 @@
 
 	let emptyFileNames = $state<string[]>([]);
 
+	let initialMessage = $state('');
+
 	let isEmpty = $derived(
 		showCenteredEmpty && !activeConversation() && activeMessages().length === 0 && !isLoading()
 	);
@@ -79,7 +82,7 @@
 	let isServerLoading = $derived(serverLoading());
 	let hasPropsError = $derived(!!serverError());
 
-	let isCurrentConversationLoading = $derived(isLoading());
+	let isCurrentConversationLoading = $derived(isLoading() || isChatStreaming());
 
 	let isRouter = $derived(isRouterMode());
 
@@ -221,6 +224,14 @@
 		}
 	}
 
+	async function handleSystemPromptAdd(draft: { message: string; files: ChatUploadedFile[] }) {
+		if (draft.message || draft.files.length > 0) {
+			chatStore.savePendingDraft(draft.message, draft.files);
+		}
+
+		await chatStore.addSystemPrompt();
+	}
+
 	function handleScroll() {
 		if (disableAutoScroll || !chatScrollContainer) return;
 
@@ -343,6 +354,12 @@
 		if (!disableAutoScroll) {
 			setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
 		}
+
+		const pendingDraft = chatStore.consumePendingDraft();
+		if (pendingDraft) {
+			initialMessage = pendingDraft.message;
+			uploadedFiles = pendingDraft.files;
+		}
 	});
 
 	$effect(() => {
@@ -428,11 +445,13 @@
 			<div class="conversation-chat-form pointer-events-auto rounded-t-3xl pb-4">
 				<ChatForm
 					disabled={hasPropsError || isEditing()}
+					{initialMessage}
 					isLoading={isCurrentConversationLoading}
 					onFileRemove={handleFileRemove}
 					onFileUpload={handleFileUpload}
 					onSend={handleSendMessage}
 					onStop={() => chatStore.stopGeneration()}
+					onSystemPromptAdd={handleSystemPromptAdd}
 					showHelperText={false}
 					bind:uploadedFiles
 				/>
@@ -486,11 +505,13 @@
 			<div in:fly={{ y: 10, duration: 250, delay: hasPropsError ? 0 : 300 }}>
 				<ChatForm
 					disabled={hasPropsError}
+					{initialMessage}
 					isLoading={isCurrentConversationLoading}
 					onFileRemove={handleFileRemove}
 					onFileUpload={handleFileUpload}
 					onSend={handleSendMessage}
 					onStop={() => chatStore.stopGeneration()}
+					onSystemPromptAdd={handleSystemPromptAdd}
 					showHelperText={true}
 					bind:uploadedFiles
 				/>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
index 5a668aa300..967f19bbce 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@@ -254,8 +254,13 @@
 					type: 'checkbox'
 				},
 				{
-					key: 'disableReasoningFormat',
-					label: 'Show raw LLM output',
+					key: 'disableReasoningParsing',
+					label: 'Disable reasoning content parsing',
+					type: 'checkbox'
+				},
+				{
+					key: 'showRawOutputSwitch',
+					label: 'Enable raw output toggle',
 					type: 'checkbox'
 				},
 				{
diff --git a/tools/server/webui/src/lib/components/app/content/CollapsibleContentBlock.svelte b/tools/server/webui/src/lib/components/app/content/CollapsibleContentBlock.svelte
new file mode 100644
index 0000000000..082738da57
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/content/CollapsibleContentBlock.svelte
@@ -0,0 +1,97 @@
+<script lang="ts">
+	import ChevronsUpDownIcon from '@lucide/svelte/icons/chevrons-up-down';
+	import * as Collapsible from '$lib/components/ui/collapsible/index.js';
+	import { buttonVariants } from '$lib/components/ui/button/index.js';
+	import { Card } from '$lib/components/ui/card';
+	import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte';
+	import type { Snippet } from 'svelte';
+	import type { Component } from 'svelte';
+
+	interface Props {
+		open?: boolean;
+		class?: string;
+		icon?: Component;
+		iconClass?: string;
+		title: string;
+		subtitle?: string;
+		isStreaming?: boolean;
+		onToggle?: () => void;
+		children: Snippet;
+	}
+
+	let {
+		open = $bindable(false),
+		class: className = '',
+		icon: Icon,
+		iconClass = 'h-4 w-4',
+		title,
+		subtitle,
+		isStreaming = false,
+		onToggle,
+		children
+	}: Props = $props();
+
+	let contentContainer: HTMLDivElement | undefined = $state();
+	const autoScroll = createAutoScrollController();
+
+	$effect(() => {
+		autoScroll.setContainer(contentContainer);
+	});
+
+	$effect(() => {
+		// Only auto-scroll when open and streaming
+		autoScroll.updateInterval(open && isStreaming);
+	});
+
+	function handleScroll() {
+		autoScroll.handleScroll();
+	}
+</script>
+
+<Collapsible.Root
+	{open}
+	onOpenChange={(value) => {
+		open = value;
+		onToggle?.();
+	}}
+	class={className}
+>
+	<Card class="gap-0 border-muted bg-muted/30 py-0">
+		<Collapsible.Trigger class="flex w-full cursor-pointer items-center justify-between p-3">
+			<div class="flex items-center gap-2 text-muted-foreground">
+				{#if Icon}
+					<Icon class={iconClass} />
+				{/if}
+
+				<span class="font-mono text-sm font-medium">{title}</span>
+
+				{#if subtitle}
+					<span class="text-xs italic">{subtitle}</span>
+				{/if}
+			</div>
+
+			<div
+				class={buttonVariants({
+					variant: 'ghost',
+					size: 'sm',
+					class: 'h-6 w-6 p-0 text-muted-foreground hover:text-foreground'
+				})}
+			>
+				<ChevronsUpDownIcon class="h-4 w-4" />
+
+				<span class="sr-only">Toggle content</span>
+			</div>
+		</Collapsible.Trigger>
+
+		<Collapsible.Content>
+			<div
+				bind:this={contentContainer}
+				class="overflow-y-auto border-t border-muted px-3 pb-3"
+				onscroll={handleScroll}
+				style="min-height: var(--min-message-height); max-height: var(--max-message-height);"
+			>
+				{@render children()}
+			</div>
+		</Collapsible.Content>
+	</Card>
+</Collapsible.Root>
diff --git a/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte
new file mode 100644
index 0000000000..ef6c7e064f
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/content/MarkdownContent.svelte
@@ -0,0 +1,1201 @@
+<script lang="ts">
+	import { remark } from 'remark';
+	import remarkBreaks from 'remark-breaks';
+	import remarkGfm from 'remark-gfm';
+	import remarkMath from 'remark-math';
+	import rehypeHighlight from 'rehype-highlight';
+	import remarkRehype from 'remark-rehype';
+	import rehypeKatex from 'rehype-katex';
+	import rehypeStringify from 'rehype-stringify';
+	import type { Root as HastRoot, RootContent as HastRootContent } from 'hast';
+	import type { Root as MdastRoot } from 'mdast';
+	import { browser } from '$app/environment';
+	import { onDestroy, tick } from 'svelte';
+	import { SvelteMap } from 'svelte/reactivity';
+	import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer';
+	import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links';
+	import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks';
+	import { remarkLiteralHtml } from '$lib/markdown/literal-html';
+	import { copyCodeToClipboard, preprocessLaTeX, getImageErrorFallbackHtml } from '$lib/utils';
+	import {
+		IMAGE_NOT_ERROR_BOUND_SELECTOR,
+		DATA_ERROR_BOUND_ATTR,
+		DATA_ERROR_HANDLED_ATTR,
+		BOOL_TRUE_STRING
+	} from '$lib/constants/markdown';
+	import { FileTypeText } from '$lib/enums/files';
+	import {
+		highlightCode,
+		detectIncompleteCodeBlock,
+		type IncompleteCodeBlock
+	} from '$lib/utils/code';
+	import '$styles/katex-custom.scss';
+	import githubDarkCss from 'highlight.js/styles/github-dark.css?inline';
+	import githubLightCss from 'highlight.js/styles/github.css?inline';
+	import { mode } from 'mode-watcher';
+	import ActionIconsCodeBlock from '$lib/components/app/actions/ActionIconsCodeBlock.svelte';
+	import DialogCodePreview from '$lib/components/app/misc/CodePreviewDialog.svelte';
+	import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte';
+	import type { DatabaseMessageExtra } from '$lib/types/database';
+
+	interface Props {
+		attachments?: DatabaseMessageExtra[];
+		content: string;
+		class?: string;
+		disableMath?: boolean;
+	}
+
+	interface MarkdownBlock {
+		id: string;
+		html: string;
+		contentHash?: string;
+	}
+
+	let { content, attachments, class: className = '', disableMath = false }: Props = $props();
+
+	let containerRef = $state<HTMLDivElement>();
+	let renderedBlocks = $state<MarkdownBlock[]>([]);
+	let unstableBlockHtml = $state('');
+	let incompleteCodeBlock = $state<IncompleteCodeBlock | null>(null);
+	let previewDialogOpen = $state(false);
+	let previewCode = $state('');
+	let previewLanguage = $state('text');
+	let streamingCodeScrollContainer = $state<HTMLDivElement>();
+
+	// Auto-scroll controller for streaming code block content
+	const streamingAutoScroll = createAutoScrollController();
+
+	let pendingMarkdown: string | null = null;
+	let isProcessing = false;
+
+	// Per-instance transform cache, avoids re-transforming stable blocks during streaming
+	// Garbage collected when component is destroyed (on conversation change)
+	const transformCache = new SvelteMap<string, string>();
+	let previousContent = '';
+
+	const themeStyleId = `highlight-theme-${(window.idxThemeStyle = (window.idxThemeStyle ?? 0) + 1)}`;
+
+	let processor = $derived(() => {
+		void attachments;
+		// eslint-disable-next-line @typescript-eslint/no-explicit-any
+		let proc: any = remark().use(remarkGfm); // GitHub Flavored Markdown
+
+		if (!disableMath) {
+			proc = proc.use(remarkMath); // Parse $inline$ and $$block$$ math
+		}
+
+		proc = proc
+			.use(remarkBreaks) // Convert line breaks to <br>
+			.use(remarkLiteralHtml) // Treat raw HTML as literal text with preserved indentation
+			.use(remarkRehype); // Convert Markdown AST to rehype
+
+		if (!disableMath) {
+			proc = proc.use(rehypeKatex); // Render math using KaTeX
+		}
+
+		return proc
+			.use(rehypeHighlight, {
+				aliases: { [FileTypeText.XML]: [FileTypeText.SVELTE, FileTypeText.VUE] }
+			}) // Add syntax highlighting
+			.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
+			.use(rehypeEnhanceLinks) // Add target="_blank" to links
+			.use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions
+			.use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string
+	});
+
+	/**
+	 * Removes click event listeners from copy and preview buttons.
+	 * Called on component destroy.
+	 */
+	function cleanupEventListeners() {
+		if (!containerRef) return;
+
+		const copyButtons = containerRef.querySelectorAll<HTMLButtonElement>('.copy-code-btn');
+		const previewButtons = containerRef.querySelectorAll<HTMLButtonElement>('.preview-code-btn');
+
+		for (const button of copyButtons) {
+			button.removeEventListener('click', handleCopyClick);
+		}
+
+		for (const button of previewButtons) {
+			button.removeEventListener('click', handlePreviewClick);
+		}
+	}
+
+	/**
+	 * Removes this component's highlight.js theme style from the document head.
+	 * Called on component destroy to clean up injected styles.
+	 */
+	function cleanupHighlightTheme() {
+		if (!browser) return;
+
+		const existingTheme = document.getElementById(themeStyleId);
+		existingTheme?.remove();
+	}
+
+	/**
+	 * Loads the appropriate highlight.js theme based on dark/light mode.
+	 * Injects a scoped style element into the document head.
+	 * @param isDark - Whether to load the dark theme (true) or light theme (false)
+	 */
+	function loadHighlightTheme(isDark: boolean) {
+		if (!browser) return;
+
+		const existingTheme = document.getElementById(themeStyleId);
+		existingTheme?.remove();
+
+		const style = document.createElement('style');
+		style.id = themeStyleId;
+		style.textContent = isDark ? githubDarkCss : githubLightCss;
+
+		document.head.appendChild(style);
+	}
+
+	/**
+	 * Extracts code information from a button click target within a code block.
+	 * @param target - The clicked button element
+	 * @returns Object with rawCode and language, or null if extraction fails
+	 */
+	function getCodeInfoFromTarget(target: HTMLElement) {
+		const wrapper = target.closest('.code-block-wrapper');
+
+		if (!wrapper) {
+			console.error('No wrapper found');
+			return null;
+		}
+
+		const codeElement = wrapper.querySelector<HTMLElement>('code[data-code-id]');
+
+		if (!codeElement) {
+			console.error('No code element found in wrapper');
+			return null;
+		}
+
+		const rawCode = codeElement.textContent ?? '';
+
+		const languageLabel = wrapper.querySelector<HTMLElement>('.code-language');
+		const language = languageLabel?.textContent?.trim() || 'text';
+
+		return { rawCode, language };
+	}
+
+	/**
+	 * Generates a unique identifier for a HAST node based on its position.
+	 * Used for stable block identification during incremental rendering.
+	 * @param node - The HAST root content node
+	 * @param indexFallback - Fallback index if position is unavailable
+	 * @returns Unique string identifier for the node
+	 */
+	function getHastNodeId(node: HastRootContent, indexFallback: number): string {
+		const position = node.position;
+
+		if (position?.start?.offset != null && position?.end?.offset != null) {
+			return `hast-${position.start.offset}-${position.end.offset}`;
+		}
+
+		return `${node.type}-${indexFallback}`;
+	}
+
+	/**
+	 * Generates a hash for MDAST node based on its position.
+	 * Used for cache lookup during incremental rendering.
+	 */
+	function getMdastNodeHash(node: unknown, index: number): string {
+		const n = node as {
+			type?: string;
+			position?: { start?: { offset?: number }; end?: { offset?: number } };
+		};
+
+		if (n.position?.start?.offset != null && n.position?.end?.offset != null) {
+			return `${n.type}-${n.position.start.offset}-${n.position.end.offset}`;
+		}
+
+		return `${n.type}-idx${index}`;
+	}
+
+	/**
+	 * Check if we're in append-only mode (streaming).
+	 */
+	function isAppendMode(newContent: string): boolean {
+		return previousContent.length > 0 && newContent.startsWith(previousContent);
+	}
+
+	/**
+	 * Transforms a single MDAST node to HTML string with caching.
+	 * Runs the full remark/rehype plugin pipeline (GFM, math, syntax highlighting, etc.)
+	 * on an isolated single-node tree, then stringifies the resulting HAST to HTML.
+	 * Results are cached by node position hash for streaming performance.
+	 * @param processorInstance - The remark/rehype processor instance
+	 * @param node - The MDAST node to transform
+	 * @param index - Node index for hash fallback
+	 * @returns Object containing the HTML string and cache hash
+	 */
+	async function transformMdastNode(
+		processorInstance: ReturnType<typeof processor>,
+		node: unknown,
+		index: number
+	): Promise<{ html: string; hash: string }> {
+		const hash = getMdastNodeHash(node, index);
+
+		const cached = transformCache.get(hash);
+		if (cached) {
+			return { html: cached, hash };
+		}
+
+		const singleNodeRoot = { type: 'root', children: [node] };
+		const transformedRoot = (await processorInstance.run(singleNodeRoot as MdastRoot)) as HastRoot;
+		const html = processorInstance.stringify(transformedRoot);
+
+		transformCache.set(hash, html);
+
+		return { html, hash };
+	}
+
+	/**
+	 * Handles click events on copy buttons within code blocks.
+	 * Copies the raw code content to the clipboard.
+	 * @param event - The click event from the copy button
+	 */
+	async function handleCopyClick(event: Event) {
+		event.preventDefault();
+		event.stopPropagation();
+
+		const target = event.currentTarget as HTMLButtonElement | null;
+
+		if (!target) {
+			return;
+		}
+
+		const info = getCodeInfoFromTarget(target);
+
+		if (!info) {
+			return;
+		}
+
+		try {
+			await copyCodeToClipboard(info.rawCode);
+		} catch (error) {
+			console.error('Failed to copy code:', error);
+		}
+	}
+
+	/**
+	 * Handles preview dialog open state changes.
+	 * Clears preview content when dialog is closed.
+	 * @param open - Whether the dialog is being opened or closed
+	 */
+	function handlePreviewDialogOpenChange(open: boolean) {
+		previewDialogOpen = open;
+
+		if (!open) {
+			previewCode = '';
+			previewLanguage = 'text';
+		}
+	}
+
+	/**
+	 * Handles click events on preview buttons within HTML code blocks.
+	 * Opens a preview dialog with the rendered HTML content.
+	 * @param event - The click event from the preview button
+	 */
+	function handlePreviewClick(event: Event) {
+		event.preventDefault();
+		event.stopPropagation();
+
+		const target = event.currentTarget as HTMLButtonElement | null;
+
+		if (!target) {
+			return;
+		}
+
+		const info = getCodeInfoFromTarget(target);
+
+		if (!info) {
+			return;
+		}
+
+		previewCode = info.rawCode;
+		previewLanguage = info.language;
+		previewDialogOpen = true;
+	}
+
+	/**
+	 * Processes markdown content into stable and unstable HTML blocks.
+	 * Uses incremental rendering: stable blocks are cached, unstable block is re-rendered.
+	 * Incomplete code blocks are rendered using SyntaxHighlightedCode to maintain interactivity.
+	 * @param markdown - The raw markdown string to process
+	 */
+	async function processMarkdown(markdown: string) {
+		// Early exit if content unchanged (can happen with rapid coalescing)
+		if (markdown === previousContent) {
+			return;
+		}
+
+		if (!markdown) {
+			renderedBlocks = [];
+			unstableBlockHtml = '';
+			incompleteCodeBlock = null;
+			previousContent = '';
+			return;
+		}
+
+		// Check for incomplete code block at the end of content
+		const incompleteBlock = detectIncompleteCodeBlock(markdown);
+
+		if (incompleteBlock) {
+			// Process only the prefix (content before the incomplete code block)
+			const prefixMarkdown = markdown.slice(0, incompleteBlock.openingIndex);
+
+			if (prefixMarkdown.trim()) {
+				const normalizedPrefix = preprocessLaTeX(prefixMarkdown);
+				const processorInstance = processor();
+				const ast = processorInstance.parse(normalizedPrefix) as MdastRoot;
+				const mdastChildren = (ast as { children?: unknown[] }).children ?? [];
+				const nextBlocks: MarkdownBlock[] = [];
+
+				// Check if we're in append mode for cache reuse
+				const appendMode = isAppendMode(prefixMarkdown);
+				const previousBlockCount = appendMode ? renderedBlocks.length : 0;
+
+				// All prefix blocks are now stable since code block is separate
+				for (let index = 0; index < mdastChildren.length; index++) {
+					const child = mdastChildren[index];
+
+					// In append mode, reuse previous blocks if unchanged
+					if (appendMode && index < previousBlockCount) {
+						const prevBlock = renderedBlocks[index];
+						const currentHash = getMdastNodeHash(child, index);
+
+						if (prevBlock?.contentHash === currentHash) {
+							nextBlocks.push(prevBlock);
+
+							continue;
+						}
+					}
+
+					// Transform this block (with caching)
+					const { html, hash } = await transformMdastNode(processorInstance, child, index);
+					const id = getHastNodeId(
+						{ position: (child as { position?: unknown }).position } as HastRootContent,
+						index
+					);
+
+					nextBlocks.push({ id, html, contentHash: hash });
+				}
+
+				renderedBlocks = nextBlocks;
+			} else {
+				renderedBlocks = [];
+			}
+
+			previousContent = prefixMarkdown;
+			unstableBlockHtml = '';
+			incompleteCodeBlock = incompleteBlock;
+
+			return;
+		}
+
+		// No incomplete code block - use standard processing
+		incompleteCodeBlock = null;
+
+		const normalized = preprocessLaTeX(markdown);
+		const processorInstance = processor();
+		const ast = processorInstance.parse(normalized) as MdastRoot;
+		const mdastChildren = (ast as { children?: unknown[] }).children ?? [];
+		const stableCount = Math.max(mdastChildren.length - 1, 0);
+		const nextBlocks: MarkdownBlock[] = [];
+
+		// Check if we're in append mode for cache reuse
+		const appendMode = isAppendMode(markdown);
+		const previousBlockCount = appendMode ? renderedBlocks.length : 0;
+
+		for (let index = 0; index < stableCount; index++) {
+			const child = mdastChildren[index];
+
+			// In append mode, reuse previous blocks if unchanged
+			if (appendMode && index < previousBlockCount) {
+				const prevBlock = renderedBlocks[index];
+				const currentHash = getMdastNodeHash(child, index);
+				if (prevBlock?.contentHash === currentHash) {
+					nextBlocks.push(prevBlock);
+
+					continue;
+				}
+			}
+
+			// Transform this block (with caching)
+			const { html, hash } = await transformMdastNode(processorInstance, child, index);
+			const id = getHastNodeId(
+				{ position: (child as { position?: unknown }).position } as HastRootContent,
+				index
+			);
+
+			nextBlocks.push({ id, html, contentHash: hash });
+		}
+
+		let unstableHtml = '';
+
+		if (mdastChildren.length > stableCount) {
+			const unstableChild = mdastChildren[stableCount];
+			const singleNodeRoot = { type: 'root', children: [unstableChild] };
+			const transformedRoot = (await processorInstance.run(
+				singleNodeRoot as MdastRoot
+			)) as HastRoot;
+
+			unstableHtml = processorInstance.stringify(transformedRoot);
+		}
+
+		renderedBlocks = nextBlocks;
+		previousContent = markdown;
+		await tick(); // Force DOM sync before updating unstable HTML block
+		unstableBlockHtml = unstableHtml;
+	}
+
+	/**
+	 * Attaches click event listeners to copy and preview buttons in code blocks.
+	 * Uses data-listener-bound attribute to prevent duplicate bindings.
+	 */
+	function setupCodeBlockActions() {
+		if (!containerRef) return;
+
+		const wrappers = containerRef.querySelectorAll<HTMLElement>('.code-block-wrapper');
+
+		for (const wrapper of wrappers) {
+			const copyButton = wrapper.querySelector<HTMLButtonElement>('.copy-code-btn');
+			const previewButton = wrapper.querySelector<HTMLButtonElement>('.preview-code-btn');
+
+			if (copyButton && copyButton.dataset.listenerBound !== 'true') {
+				copyButton.dataset.listenerBound = 'true';
+				copyButton.addEventListener('click', handleCopyClick);
+			}
+
+			if (previewButton && previewButton.dataset.listenerBound !== 'true') {
+				previewButton.dataset.listenerBound = 'true';
+				previewButton.addEventListener('click', handlePreviewClick);
+			}
+		}
+	}
+
+	/**
+	 * Attaches error handlers to images to show fallback UI when loading fails (e.g., CORS).
+	 * Uses data-error-bound attribute to prevent duplicate bindings.
+	 */
+	function setupImageErrorHandlers() {
+		if (!containerRef) return;
+
+		const images = containerRef.querySelectorAll<HTMLImageElement>(IMAGE_NOT_ERROR_BOUND_SELECTOR);
+
+		for (const img of images) {
+			img.dataset[DATA_ERROR_BOUND_ATTR] = BOOL_TRUE_STRING;
+			img.addEventListener('error', handleImageError);
+		}
+	}
+
+	/**
+	 * Handles image load errors by replacing the image with a fallback UI.
+	 * Shows a placeholder with a link to open the image in a new tab.
+	 */
+	function handleImageError(event: Event) {
+		const img = event.target as HTMLImageElement;
+		if (!img || !img.src) return;
+
+		// Don't handle data URLs or already-handled images
+		if (img.src.startsWith('data:') || img.dataset[DATA_ERROR_HANDLED_ATTR] === BOOL_TRUE_STRING)
+			return;
+		img.dataset[DATA_ERROR_HANDLED_ATTR] = BOOL_TRUE_STRING;
+
+		const src = img.src;
+		// Create fallback element
+		const fallback = document.createElement('div');
+		fallback.className = 'image-load-error';
+		fallback.innerHTML = getImageErrorFallbackHtml(src);
+
+		// Replace image with fallback
+		img.parentNode?.replaceChild(fallback, img);
+	}
+
+	/**
+	 * Queues markdown for processing with coalescing support.
+	 * Only processes the latest markdown when multiple updates arrive quickly.
+	 * Uses requestAnimationFrame to yield to browser paint between batches.
+	 * @param markdown - The markdown content to render
+	 */
+	async function updateRenderedBlocks(markdown: string) {
+		pendingMarkdown = markdown;
+
+		if (isProcessing) {
+			return;
+		}
+
+		isProcessing = true;
+
+		try {
+			while (pendingMarkdown !== null) {
+				const nextMarkdown = pendingMarkdown;
+				pendingMarkdown = null;
+
+				await processMarkdown(nextMarkdown);
+
+				// Yield to browser for paint. During this, new chunks coalesce
+				// into pendingMarkdown, so we always render the latest state.
+				if (pendingMarkdown !== null) {
+					await new Promise((resolve) => requestAnimationFrame(resolve));
+				}
+			}
+		} catch (error) {
+			console.error('Failed to process markdown:', error);
+			renderedBlocks = [];
+			unstableBlockHtml = markdown.replace(/\n/g, '<br>');
+		} finally {
+			isProcessing = false;
+		}
+	}
+
+	$effect(() => {
+		const currentMode = mode.current;
+		const isDark = currentMode === 'dark';
+
+		loadHighlightTheme(isDark);
+	});
+
+	$effect(() => {
+		updateRenderedBlocks(content);
+	});
+
+	$effect(() => {
+		const hasRenderedBlocks = renderedBlocks.length > 0;
+		const hasUnstableBlock = Boolean(unstableBlockHtml);
+
+		if ((hasRenderedBlocks || hasUnstableBlock) && containerRef) {
+			setupCodeBlockActions();
+			setupImageErrorHandlers();
+		}
+	});
+
+	// Auto-scroll for streaming code block
+	$effect(() => {
+		streamingAutoScroll.setContainer(streamingCodeScrollContainer);
+	});
+
+	$effect(() => {
+		streamingAutoScroll.updateInterval(incompleteCodeBlock !== null);
+	});
+
+	onDestroy(() => {
+		cleanupEventListeners();
+		cleanupHighlightTheme();
+		streamingAutoScroll.destroy();
+	});
+</script>
+
+<div bind:this={containerRef} class={className}>
+	{#each renderedBlocks as block (block.id)}
+		<div class="markdown-block" data-block-id={block.id}>
+			<!-- eslint-disable-next-line no-at-html-tags -->
+			{@html block.html}
+		</div>
+	{/each}
+
+	{#if unstableBlockHtml}
+		<div class="markdown-block markdown-block--unstable" data-block-id="unstable">
+			<!-- eslint-disable-next-line no-at-html-tags -->
+			{@html unstableBlockHtml}
+		</div>
+	{/if}
+
+	{#if incompleteCodeBlock}
+		<div class="code-block-wrapper streaming-code-block relative">
+			<div class="code-block-header">
+				<span class="code-language">{incompleteCodeBlock.language || 'text'}</span>
+				<ActionIconsCodeBlock
+					code={incompleteCodeBlock.code}
+					language={incompleteCodeBlock.language || 'text'}
+					disabled={true}
+					onPreview={(code: string, lang: string) => {
+						previewCode = code;
+						previewLanguage = lang;
+						previewDialogOpen = true;
+					}}
+				/>
+			</div>
+			<div
+				bind:this={streamingCodeScrollContainer}
+				class="streaming-code-scroll-container"
+				onscroll={() => streamingAutoScroll.handleScroll()}
+			>
+				<pre class="streaming-code-pre"><code
+						class="hljs language-{incompleteCodeBlock.language || 'text'}"
+						>{@html highlightCode(
+							incompleteCodeBlock.code,
+							incompleteCodeBlock.language || 'text'
+						)}</code
+					></pre>
+			</div>
+		</div>
+	{/if}
+</div>
+
+<DialogCodePreview
+	open={previewDialogOpen}
+	code={previewCode}
+	language={previewLanguage}
+	onOpenChange={handlePreviewDialogOpenChange}
+/>
+
+<style>
+	.markdown-block,
+	.markdown-block--unstable {
+		display: contents;
+	}
+
+	/* Streaming code block uses .code-block-wrapper styles */
+	.streaming-code-block .streaming-code-pre {
+		background: transparent;
+		padding: 0.5rem;
+		margin: 0;
+		overflow-x: visible;
+		border-radius: 0;
+		border: none;
+		font-size: 0.875rem;
+	}
+
+	/* Base typography styles */
+	div :global(p) {
+		margin-block: 1rem;
+		line-height: 1.75;
+	}
+
+	div :global(:is(h1, h2, h3, h4, h5, h6):first-child) {
+		margin-top: 0;
+	}
+
+	/* Headers with consistent spacing */
+	div :global(h1) {
+		font-size: 1.875rem;
+		font-weight: 700;
+		line-height: 1.2;
+		margin: 1.5rem 0 0.75rem 0;
+	}
+
+	div :global(h2) {
+		font-size: 1.5rem;
+		font-weight: 600;
+		line-height: 1.3;
+		margin: 1.25rem 0 0.5rem 0;
+	}
+
+	div :global(h3) {
+		font-size: 1.25rem;
+		font-weight: 600;
+		margin: 1.5rem 0 0.5rem 0;
+		line-height: 1.4;
+	}
+
+	div :global(h4) {
+		font-size: 1.125rem;
+		font-weight: 600;
+		margin: 0.75rem 0 0.25rem 0;
+	}
+
+	div :global(h5) {
+		font-size: 1rem;
+		font-weight: 600;
+		margin: 0.5rem 0 0.25rem 0;
+	}
+
+	div :global(h6) {
+		font-size: 0.875rem;
+		font-weight: 600;
+		margin: 0.5rem 0 0.25rem 0;
+	}
+
+	/* Text formatting */
+	div :global(strong) {
+		font-weight: 600;
+	}
+
+	div :global(em) {
+		font-style: italic;
+	}
+
+	div :global(del) {
+		text-decoration: line-through;
+		opacity: 0.7;
+	}
+
+	/* Inline code */
+	div :global(code:not(pre code)) {
+		background: var(--muted);
+		color: var(--muted-foreground);
+		padding: 0.125rem 0.375rem;
+		border-radius: 0.375rem;
+		font-size: 0.875rem;
+		font-family:
+			ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
+			'Liberation Mono', Menlo, monospace;
+	}
+
+	div :global(pre) {
+		display: inline;
+		margin: 0 !important;
+		overflow: hidden !important;
+		background: var(--muted);
+		overflow-x: auto;
+		border-radius: 1rem;
+		border: none;
+		line-height: 1 !important;
+	}
+
+	div :global(pre code) {
+		padding: 0 !important;
+		display: inline !important;
+	}
+
+	div :global(code) {
+		background: transparent;
+		color: var(--code-foreground);
+	}
+
+	/* Links */
+	div :global(a) {
+		color: var(--primary);
+		text-decoration: underline;
+		text-underline-offset: 2px;
+		transition: color 0.2s ease;
+		overflow-wrap: anywhere;
+		word-break: break-all;
+	}
+
+	div :global(a:hover) {
+		color: var(--primary);
+	}
+
+	/* Lists */
+	div :global(ul) {
+		list-style-type: disc;
+		margin-left: 1.5rem;
+		margin-bottom: 1rem;
+	}
+
+	div :global(ol) {
+		list-style-type: decimal;
+		margin-left: 1.5rem;
+		margin-bottom: 1rem;
+	}
+
+	div :global(li) {
+		margin-bottom: 0.25rem;
+		padding-left: 0.5rem;
+	}
+
+	div :global(li::marker) {
+		color: var(--muted-foreground);
+	}
+
+	/* Nested lists */
+	div :global(ul ul) {
+		list-style-type: circle;
+		margin-top: 0.25rem;
+		margin-bottom: 0.25rem;
+	}
+
+	div :global(ol ol) {
+		list-style-type: lower-alpha;
+		margin-top: 0.25rem;
+		margin-bottom: 0.25rem;
+	}
+
+	/* Task lists */
+	div :global(.task-list-item) {
+		list-style: none;
+		margin-left: 0;
+		padding-left: 0;
+	}
+
+	div :global(.task-list-item-checkbox) {
+		margin-right: 0.5rem;
+		margin-top: 0.125rem;
+	}
+
+	/* Blockquotes */
+	div :global(blockquote) {
+		border-left: 4px solid var(--border);
+		padding: 0.5rem 1rem;
+		margin: 1.5rem 0;
+		font-style: italic;
+		color: var(--muted-foreground);
+		background: var(--muted);
+		border-radius: 0 0.375rem 0.375rem 0;
+	}
+
+	/* Tables */
+	div :global(table) {
+		width: 100%;
+		margin: 1.5rem 0;
+		border-collapse: collapse;
+		border: 1px solid var(--border);
+		border-radius: 0.375rem;
+		overflow: hidden;
+	}
+
+	div :global(th) {
+		background: hsl(var(--muted) / 0.3);
+		border: 1px solid var(--border);
+		padding: 0.5rem 0.75rem;
+		text-align: left;
+		font-weight: 600;
+	}
+
+	div :global(td) {
+		border: 1px solid var(--border);
+		padding: 0.5rem 0.75rem;
+	}
+
+	div :global(tr:nth-child(even)) {
+		background: hsl(var(--muted) / 0.1);
+	}
+
+	/* User message markdown should keep table borders visible on light primary backgrounds */
+	div.markdown-user-content :global(table),
+	div.markdown-user-content :global(th),
+	div.markdown-user-content :global(td),
+	div.markdown-user-content :global(.table-wrapper) {
+		border-color: currentColor;
+	}
+
+	/* Horizontal rules */
+	div :global(hr) {
+		border: none;
+		border-top: 1px solid var(--border);
+		margin: 1.5rem 0;
+	}
+
+	/* Images */
+	div :global(img) {
+		border-radius: 0.5rem;
+		box-shadow:
+			0 1px 3px 0 rgb(0 0 0 / 0.1),
+			0 1px 2px -1px rgb(0 0 0 / 0.1);
+		margin: 1.5rem 0;
+		max-width: 100%;
+		height: auto;
+	}
+
+	/* Code blocks */
+
+	div :global(.code-block-wrapper) {
+		margin: 1.5rem 0;
+		border-radius: 0.75rem;
+		overflow: hidden;
+		border: 1px solid color-mix(in oklch, var(--border) 30%, transparent);
+		background: var(--code-background);
+		box-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05);
+		min-height: var(--min-message-height);
+		max-height: var(--max-message-height);
+	}
+
+	:global(.dark) div :global(.code-block-wrapper) {
+		border-color: color-mix(in oklch, var(--border) 20%, transparent);
+	}
+
+	/* Scroll container for code blocks (both streaming and completed) */
+	div :global(.code-block-scroll-container),
+	.streaming-code-scroll-container {
+		min-height: var(--min-message-height);
+		max-height: var(--max-message-height);
+		overflow-y: auto;
+		overflow-x: auto;
+		padding: 3rem 1rem 1rem;
+		line-height: 1.3;
+	}
+
+	div :global(.code-block-header) {
+		display: flex;
+		justify-content: space-between;
+		align-items: center;
+		padding: 0.5rem 1rem 0;
+		font-size: 0.875rem;
+		position: absolute;
+		top: 0;
+		left: 0;
+		right: 0;
+	}
+
+	div :global(.code-language) {
+		color: var(--color-foreground);
+		font-weight: 500;
+		font-family:
+			ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
+			'Liberation Mono', Menlo, monospace;
+		text-transform: uppercase;
+		font-size: 0.75rem;
+		letter-spacing: 0.05em;
+	}
+
+	div :global(.code-block-actions) {
+		display: flex;
+		align-items: center;
+		gap: 0.5rem;
+	}
+
+	div :global(.copy-code-btn),
+	div :global(.preview-code-btn) {
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		padding: 0;
+		background: transparent;
+		color: var(--code-foreground);
+		cursor: pointer;
+		transition: all 0.2s ease;
+	}
+
+	div :global(.copy-code-btn:hover),
+	div :global(.preview-code-btn:hover) {
+		transform: scale(1.05);
+	}
+
+	div :global(.copy-code-btn:active),
+	div :global(.preview-code-btn:active) {
+		transform: scale(0.95);
+	}
+
+	div :global(.code-block-wrapper pre) {
+		background: transparent;
+		margin: 0;
+		border-radius: 0;
+		border: none;
+		font-size: 0.875rem;
+	}
+
+	/* Mentions and hashtags */
+	div :global(.mention) {
+		color: hsl(var(--primary));
+		font-weight: 500;
+		text-decoration: none;
+	}
+
+	div :global(.mention:hover) {
+		text-decoration: underline;
+	}
+
+	div :global(.hashtag) {
+		color: hsl(var(--primary));
+		font-weight: 500;
+		text-decoration: none;
+	}
+
+	div :global(.hashtag:hover) {
+		text-decoration: underline;
+	}
+
+	/* Advanced table enhancements */
+	div :global(table) {
+		transition: all 0.2s ease;
+	}
+
+	div :global(table:hover) {
+		box-shadow:
+			0 4px 6px -1px rgb(0 0 0 / 0.1),
+			0 2px 4px -2px rgb(0 0 0 / 0.1);
+	}
+
+	div :global(th:hover),
+	div :global(td:hover) {
+		background: var(--muted);
+	}
+
+	/* Disable hover effects when rendering user messages */
+	.markdown-user-content :global(a),
+	.markdown-user-content :global(a:hover) {
+		color: inherit;
+	}
+
+	.markdown-user-content :global(table:hover) {
+		box-shadow: none;
+	}
+
+	.markdown-user-content :global(th:hover),
+	.markdown-user-content :global(td:hover) {
+		background: inherit;
+	}
+
+	/* Enhanced blockquotes */
+	div :global(blockquote) {
+		transition: all 0.2s ease;
+		position: relative;
+	}
+
+	div :global(blockquote:hover) {
+		border-left-width: 6px;
+		background: var(--muted);
+		transform: translateX(2px);
+	}
+
+	div :global(blockquote::before) {
+		content: '"';
+		position: absolute;
+		top: -0.5rem;
+		left: 0.5rem;
+		font-size: 3rem;
+		color: var(--muted-foreground);
+		font-family: serif;
+		line-height: 1;
+	}
+
+	/* Enhanced images */
+	div :global(img) {
+		transition: all 0.3s ease;
+		cursor: pointer;
+	}
+
+	div :global(img:hover) {
+		transform: scale(1.02);
+		box-shadow:
+			0 10px 15px -3px rgb(0 0 0 / 0.1),
+			0 4px 6px -4px rgb(0 0 0 / 0.1);
+	}
+
+	/* Image zoom overlay */
+	div :global(.image-zoom-overlay) {
+		position: fixed;
+		top: 0;
+		left: 0;
+		right: 0;
+		bottom: 0;
+		background: rgba(0, 0, 0, 0.8);
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		z-index: 1000;
+		cursor: pointer;
+	}
+
+	div :global(.image-zoom-overlay img) {
+		max-width: 90vw;
+		max-height: 90vh;
+		border-radius: 0.5rem;
+		box-shadow: 0 25px 50px -12px rgb(0 0 0 / 0.25);
+	}
+
+	/* Enhanced horizontal rules */
+	div :global(hr) {
+		border: none;
+		height: 2px;
+		background: linear-gradient(to right, transparent, var(--border), transparent);
+		margin: 2rem 0;
+		position: relative;
+	}
+
+	div :global(hr::after) {
+		content: '';
+		position: absolute;
+		top: 50%;
+		left: 50%;
+		transform: translate(-50%, -50%);
+		width: 1rem;
+		height: 1rem;
+		background: var(--border);
+		border-radius: 50%;
+	}
+
+	/* Scrollable tables */
+	div :global(.table-wrapper) {
+		overflow-x: auto;
+		margin: 1.5rem 0;
+		border-radius: 0.5rem;
+		border: 1px solid var(--border);
+	}
+
+	div :global(.table-wrapper table) {
+		margin: 0;
+		border: none;
+	}
+
+	/* Responsive adjustments */
+	@media (max-width: 640px) {
+		div :global(h1) {
+			font-size: 1.5rem;
+		}
+
+		div :global(h2) {
+			font-size: 1.25rem;
+		}
+
+		div :global(h3) {
+			font-size: 1.125rem;
+		}
+
+		div :global(table) {
+			font-size: 0.875rem;
+		}
+
+		div :global(th),
+		div :global(td) {
+			padding: 0.375rem 0.5rem;
+		}
+
+		div :global(.table-wrapper) {
+			margin: 0.5rem -1rem;
+			border-radius: 0;
+			border-left: none;
+			border-right: none;
+		}
+	}
+
+	/* Dark mode adjustments */
+	@media (prefers-color-scheme: dark) {
+		div :global(blockquote:hover) {
+			background: var(--muted);
+		}
+	}
+
+	/* Image load error fallback */
+	div :global(.image-load-error) {
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		margin: 1.5rem 0;
+		padding: 1.5rem;
+		border-radius: 0.5rem;
+		background: var(--muted);
+		border: 1px dashed var(--border);
+	}
+
+	div :global(.image-error-content) {
+		display: flex;
+		flex-direction: column;
+		align-items: center;
+		gap: 0.75rem;
+		color: var(--muted-foreground);
+		text-align: center;
+	}
+
+	div :global(.image-error-content svg) {
+		opacity: 0.5;
+	}
+
+	div :global(.image-error-text) {
+		font-size: 0.875rem;
+	}
+
+	div :global(.image-error-link) {
+		display: inline-flex;
+		align-items: center;
+		gap: 0.375rem;
+		padding: 0.5rem 1rem;
+		font-size: 0.875rem;
+		font-weight: 500;
+		color: var(--primary);
+		background: var(--background);
+		border: 1px solid var(--border);
+		border-radius: 0.375rem;
+		text-decoration: none;
+		transition: all 0.2s ease;
+	}
+
+	div :global(.image-error-link:hover) {
+		background: var(--muted);
+		border-color: var(--primary);
+	}
+</style>
diff --git a/tools/server/webui/src/lib/components/app/content/SyntaxHighlightedCode.svelte b/tools/server/webui/src/lib/components/app/content/SyntaxHighlightedCode.svelte
new file mode 100644
index 0000000000..625fdc7b1b
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/content/SyntaxHighlightedCode.svelte
@@ -0,0 +1,95 @@
+<script lang="ts">
+	import hljs from 'highlight.js';
+	import { browser } from '$app/environment';
+	import { mode } from 'mode-watcher';
+
+	import githubDarkCss from 'highlight.js/styles/github-dark.css?inline';
+	import githubLightCss from 'highlight.js/styles/github.css?inline';
+
+	interface Props {
+		code: string;
+		language?: string;
+		class?: string;
+		maxHeight?: string;
+		maxWidth?: string;
+	}
+
+	let {
+		code,
+		language = 'text',
+		class: className = '',
+		maxHeight = '60vh',
+		maxWidth = ''
+	}: Props = $props();
+
+	let highlightedHtml = $state('');
+
+	function loadHighlightTheme(isDark: boolean) {
+		if (!browser) return;
+
+		const existingThemes = document.querySelectorAll('style[data-highlight-theme-preview]');
+		existingThemes.forEach((style) => style.remove());
+
+		const style = document.createElement('style');
+		style.setAttribute('data-highlight-theme-preview', 'true');
+		style.textContent = isDark ? githubDarkCss : githubLightCss;
+
+		document.head.appendChild(style);
+	}
+
+	$effect(() => {
+		const currentMode = mode.current;
+		const isDark = currentMode === 'dark';
+
+		loadHighlightTheme(isDark);
+	});
+
+	$effect(() => {
+		if (!code) {
+			highlightedHtml = '';
+			return;
+		}
+
+		try {
+			// Check if the language is supported
+			const lang = language.toLowerCase();
+			const isSupported = hljs.getLanguage(lang);
+
+			if (isSupported) {
+				const result = hljs.highlight(code, { language: lang });
+				highlightedHtml = result.value;
+			} else {
+				// Try auto-detection or fallback to plain text
+				const result = hljs.highlightAuto(code);
+				highlightedHtml = result.value;
+			}
+		} catch {
+			// Fallback to escaped plain text
+			highlightedHtml = code.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+		}
+	});
+</script>
+
+<div
+	class="code-preview-wrapper rounded-lg border border-border bg-muted {className}"
+	style="max-height: {maxHeight}; max-width: {maxWidth};"
+>
+	<!-- Needs to be formatted as single line for proper rendering -->
+	<pre class="m-0"><code class="hljs text-sm leading-relaxed">{@html highlightedHtml}</code></pre>
+</div>
+
+<style>
+	.code-preview-wrapper {
+		font-family:
+			ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
+			'Liberation Mono', Menlo, monospace;
+	}
+
+	.code-preview-wrapper pre {
+		background: transparent;
+	}
+
+	.code-preview-wrapper code {
+		background: transparent;
+	}
+</style>
diff --git a/tools/server/webui/src/lib/components/app/content/index.ts b/tools/server/webui/src/lib/components/app/content/index.ts
new file mode 100644
index 0000000000..bca1c9f4c2
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/content/index.ts
@@ -0,0 +1,79 @@
+/**
+ *
+ * CONTENT RENDERING
+ *
+ * Components for rendering rich content: markdown, code, and previews.
+ *
+ */
+
+/**
+ * **MarkdownContent** - Rich markdown renderer
+ *
+ * Renders markdown content with syntax highlighting, LaTeX math,
+ * tables, links, and code blocks. Optimized for streaming with
+ * incremental block-based rendering.
+ *
+ * **Features:**
+ * - GFM (GitHub Flavored Markdown): tables, task lists, strikethrough
+ * - LaTeX math via KaTeX (`$inline$` and `$$block$$`)
+ * - Syntax highlighting (highlight.js) with language detection
+ * - Code copy buttons with click feedback
+ * - External links open in new tab with security attrs
+ * - Image attachment resolution from message extras
+ * - Dark/light theme support (auto-switching)
+ * - Streaming-optimized incremental rendering
+ * - Code preview dialog for large blocks
+ *
+ * @example
+ * ```svelte
+ * <MarkdownContent content={message.content} attachments={message.extra} />
+ * ```
+ */
+export { default as MarkdownContent } from './MarkdownContent.svelte';
+
+/**
+ * **SyntaxHighlightedCode** - Code syntax highlighting
+ *
+ * Renders code with syntax highlighting using highlight.js.
+ * Supports theme switching and scrollable containers.
+ *
+ * **Features:**
+ * - Auto language detection with fallback
+ * - Dark/light theme auto-switching
+ * - Scrollable container with configurable max dimensions
+ * - Monospace font styling
+ * - Preserves whitespace and formatting
+ *
+ * @example
+ * ```svelte
+ * <SyntaxHighlightedCode code={jsonString} language="json" />
+ * ```
+ */
+export { default as SyntaxHighlightedCode } from './SyntaxHighlightedCode.svelte';
+
+/**
+ * **CollapsibleContentBlock** - Expandable content card
+ *
+ * Reusable collapsible card with header, icon, and auto-scroll.
+ * Used for tool calls and reasoning blocks in chat messages.
+ *
+ * **Features:**
+ * - Collapsible content with smooth animation
+ * - Custom icon and title display
+ * - Optional subtitle/status text
+ * - Auto-scroll during streaming (pauses on user scroll)
+ * - Configurable max height with overflow scroll
+ *
+ * @example
+ * ```svelte
+ * <CollapsibleContentBlock
+ *   bind:open
+ *   icon={BrainIcon}
+ *   title="Thinking..."
+ *   isStreaming={true}
+ * >
+ *   {reasoningContent}
+ * </CollapsibleContentBlock>
+ * ```
+ */
+export { default as CollapsibleContentBlock } from './CollapsibleContentBlock.svelte';
diff --git a/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte b/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte
index e2095e0876..21412f47e5 100644
--- a/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte
+++ b/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte
@@ -17,9 +17,13 @@
 	let { conversations, messageCountMap = new Map(), mode, onCancel, onConfirm }: Props = $props();
 
 	let searchQuery = $state('');
-	let selectedIds = $state.raw<SvelteSet<string>>(new SvelteSet(conversations.map((c) => c.id)));
+	let selectedIds = $state.raw<SvelteSet<string>>(getInitialSelectedIds());
 	let lastClickedId = $state<string | null>(null);
 
+	function getInitialSelectedIds(): SvelteSet<string> {
+		return new SvelteSet(conversations.map((c) => c.id));
+	}
+
 	let filteredConversations = $derived(
 		conversations.filter((conv) => {
 			const name = conv.name || 'Untitled conversation';
@@ -92,7 +96,7 @@
 	}
 
 	function handleCancel() {
-		selectedIds = new SvelteSet(conversations.map((c) => c.id));
+		selectedIds = getInitialSelectedIds();
 		searchQuery = '';
 		lastClickedId = null;
 
@@ -100,7 +104,7 @@
 	}
 
 	export function reset() {
-		selectedIds = new SvelteSet(conversations.map((c) => c.id));
+		selectedIds = getInitialSelectedIds();
 		searchQuery = '';
 		lastClickedId = null;
 	}
diff --git a/tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte b/tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte
new file mode 100644
index 0000000000..e302f83e11
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte
@@ -0,0 +1,93 @@
+<script lang="ts">
+	import { ChevronLeft, ChevronRight } from '@lucide/svelte';
+
+	interface Props {
+		class?: string;
+		children?: import('svelte').Snippet;
+		gapSize?: string;
+		onScrollableChange?: (isScrollable: boolean) => void;
+	}
+
+	let { class: className = '', children, gapSize = '3', onScrollableChange }: Props = $props();
+
+	let canScrollLeft = $state(false);
+	let canScrollRight = $state(false);
+	let scrollContainer: HTMLDivElement | undefined = $state();
+
+	function scrollLeft(event?: MouseEvent) {
+		event?.stopPropagation();
+		event?.preventDefault();
+
+		if (!scrollContainer) return;
+
+		scrollContainer.scrollBy({ left: scrollContainer.clientWidth * -0.67, behavior: 'smooth' });
+	}
+
+	function scrollRight(event?: MouseEvent) {
+		event?.stopPropagation();
+		event?.preventDefault();
+
+		if (!scrollContainer) return;
+
+		scrollContainer.scrollBy({ left: scrollContainer.clientWidth * 0.67, behavior: 'smooth' });
+	}
+
+	function updateScrollButtons() {
+		if (!scrollContainer) return;
+
+		const { scrollLeft, scrollWidth, clientWidth } = scrollContainer;
+
+		canScrollLeft = scrollLeft > 0;
+		canScrollRight = scrollLeft < scrollWidth - clientWidth - 1;
+
+		const isScrollable = scrollWidth > clientWidth;
+		onScrollableChange?.(isScrollable);
+	}
+
+	export function resetScroll() {
+		if (scrollContainer) {
+			scrollContainer.scrollLeft = 0;
+			setTimeout(() => {
+				updateScrollButtons();
+			}, 0);
+		}
+	}
+
+	$effect(() => {
+		if (scrollContainer) {
+			setTimeout(() => {
+				updateScrollButtons();
+			}, 0);
+		}
+	});
+</script>
+
+<div class="relative {className}">
+	<button
+		class="absolute top-1/2 left-4 z-10 flex h-6 w-6 -translate-y-1/2 items-center justify-center rounded-full bg-foreground/15 shadow-md backdrop-blur-xs transition-opacity hover:bg-foreground/35 {canScrollLeft
+			? 'opacity-100'
+			: 'pointer-events-none opacity-0'}"
+		onclick={scrollLeft}
+		aria-label="Scroll left"
+	>
+		<ChevronLeft class="h-4 w-4" />
+	</button>
+
+	<div
+		class="scrollbar-hide flex items-start gap-{gapSize} overflow-x-auto"
+		bind:this={scrollContainer}
+		onscroll={updateScrollButtons}
+	>
+		{@render children?.()}
+	</div>
+
+	<button
+		class="absolute top-1/2 right-4 z-10 flex h-6 w-6 -translate-y-1/2 items-center justify-center rounded-full bg-foreground/15 shadow-md backdrop-blur-xs transition-opacity hover:bg-foreground/35 {canScrollRight
+			? 'opacity-100'
+			: 'pointer-events-none opacity-0'}"
+		onclick={scrollRight}
+		aria-label="Scroll right"
+	>
+		<ChevronRight class="h-4 w-4" />
+	</button>
+</div>
diff --git a/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte b/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte
index 5b7522fe1b..da55abda02 100644
--- a/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte
+++ b/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte
@@ -11,7 +11,9 @@
 
 	let baseClasses =
 		'px-1 pointer-events-none inline-flex select-none items-center gap-0.5 font-sans text-md font-medium opacity-0 transition-opacity -my-1';
-	let variantClasses = variant === 'destructive' ? 'text-destructive' : 'text-muted-foreground';
+	let variantClasses = $derived(
+		variant === 'destructive' ? 'text-destructive' : 'text-muted-foreground'
+	);
 </script>
 
 <kbd class="{baseClasses} {variantClasses} {className}">
diff --git a/tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte b/tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte
new file mode 100644
index 0000000000..9a8731fc78
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte
@@ -0,0 +1,48 @@
+<script lang="ts">
+	import * as Tooltip from '$lib/components/ui/tooltip';
+
+	interface Props {
+		text: string;
+		class?: string;
+	}
+
+	let { text, class: className = '' }: Props = $props();
+
+	let textElement: HTMLSpanElement | undefined = $state();
+	let isTruncated = $state(false);
+
+	function checkTruncation() {
+		if (textElement) {
+			isTruncated = textElement.scrollWidth > textElement.clientWidth;
+		}
+	}
+
+	$effect(() => {
+		if (textElement) {
+			checkTruncation();
+
+			const observer = new ResizeObserver(checkTruncation);
+			observer.observe(textElement);
+
+			return () => observer.disconnect();
+		}
+	});
+</script>
+
+{#if isTruncated}
+	<Tooltip.Root>
+		<Tooltip.Trigger class={className}>
+			<span bind:this={textElement} class="block truncate">
+				{text}
+			</span>
+		</Tooltip.Trigger>
+
+		<Tooltip.Content class="z-[9999]">
+			<p>{text}</p>
+		</Tooltip.Content>
+	</Tooltip.Root>
+{:else}
+	<span bind:this={textElement} class="{className} block truncate">
+		{text}
+	</span>
+{/if}
diff --git a/tools/server/webui/src/lib/components/app/misc/index.ts b/tools/server/webui/src/lib/components/app/misc/index.ts
new file mode 100644
index 0000000000..02bd70b24f
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/misc/index.ts
@@ -0,0 +1,45 @@
+/**
+ *
+ * MISC
+ *
+ * Miscellaneous utility components.
+ *
+ */
+
+/**
+ * **ConversationSelection** - Multi-select conversation picker
+ *
+ * List of conversations with checkboxes for multi-selection.
+ * Used in import/export dialogs for selecting conversations.
+ *
+ * **Features:**
+ * - Search/filter conversations by name
+ * - Select all / deselect all controls
+ * - Shift-click for range selection
+ * - Message count display per conversation
+ * - Mode-specific UI (export vs import)
+ */
+export { default as ConversationSelection } from './ConversationSelection.svelte';
+
+/**
+ * Horizontal scrollable carousel with navigation arrows.
+ * Used for displaying items in a horizontally scrollable container
+ * with left/right navigation buttons that appear on hover.
+ */
+export { default as HorizontalScrollCarousel } from './HorizontalScrollCarousel.svelte';
+
+/**
+ * **TruncatedText** - Text with ellipsis and tooltip
+ *
+ * Displays text with automatic truncation and full content in tooltip.
+ * Useful for long names or paths in constrained spaces.
+ */
+export { default as TruncatedText } from './TruncatedText.svelte';
+
+/**
+ * **KeyboardShortcutInfo** - Keyboard shortcut hint display
+ *
+ * Displays keyboard shortcut hints (e.g., "⌘ + Enter").
+ * Supports special keys like shift, cmd, and custom text.
+ */
+export { default as KeyboardShortcutInfo } from './KeyboardShortcutInfo.svelte';
diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte b/tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte
new file mode 100644
index 0000000000..83d856d10e
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte
@@ -0,0 +1,86 @@
+<script lang="ts">
+	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
+	import * as Tooltip from '$lib/components/ui/tooltip';
+	import { KeyboardShortcutInfo } from '$lib/components/app';
+	import type { Component } from 'svelte';
+
+	interface ActionItem {
+		icon: Component;
+		label: string;
+		onclick: (event: Event) => void;
+		variant?: 'default' | 'destructive';
+		disabled?: boolean;
+		shortcut?: string[];
+		separator?: boolean;
+	}
+
+	interface Props {
+		triggerIcon: Component;
+		triggerTooltip?: string;
+		triggerClass?: string;
+		actions: ActionItem[];
+		align?: 'start' | 'center' | 'end';
+		open?: boolean;
+	}
+
+	let {
+		triggerIcon,
+		triggerTooltip,
+		triggerClass = '',
+		actions,
+		align = 'end',
+		open = $bindable(false)
+	}: Props = $props();
+</script>
+
+<DropdownMenu.Root bind:open>
+	<DropdownMenu.Trigger
+		class="flex h-6 w-6 cursor-pointer items-center justify-center rounded-md p-0 text-sm font-medium transition-colors hover:bg-accent hover:text-accent-foreground focus:bg-accent focus:text-accent-foreground focus:outline-none disabled:pointer-events-none disabled:opacity-50 data-[state=open]:bg-accent data-[state=open]:text-accent-foreground {triggerClass}"
+		onclick={(e) => e.stopPropagation()}
+	>
+		{#if triggerTooltip}
+			<Tooltip.Root>
+				<Tooltip.Trigger>
+					{@render iconComponent(triggerIcon, 'h-3 w-3')}
+					<span class="sr-only">{triggerTooltip}</span>
+				</Tooltip.Trigger>
+				<Tooltip.Content>
+					<p>{triggerTooltip}</p>
+				</Tooltip.Content>
+			</Tooltip.Root>
+		{:else}
+			{@render iconComponent(triggerIcon, 'h-3 w-3')}
+		{/if}
+	</DropdownMenu.Trigger>
+
+	<DropdownMenu.Content {align} class="z-[999999] w-48">
+		{#each actions as action, index (action.label)}
+			{#if action.separator && index > 0}
+				<DropdownMenu.Separator />
+			{/if}
+
+			<DropdownMenu.Item
+				onclick={action.onclick}
+				variant={action.variant}
+				disabled={action.disabled}
+				class="flex items-center justify-between hover:[&>kbd]:opacity-100"
+			>
+				<div class="flex items-center gap-2">
+					{@render iconComponent(
+						action.icon,
+						`h-4 w-4 ${action.variant === 'destructive' ? 'text-destructive' : ''}`
+					)}
+					{action.label}
+				</div>
+
+				{#if action.shortcut}
+					<KeyboardShortcutInfo keys={action.shortcut} variant={action.variant} />
+				{/if}
+			</DropdownMenu.Item>
+		{/each}
+	</DropdownMenu.Content>
+</DropdownMenu.Root>
+
+{#snippet iconComponent(IconComponent: Component, className: string)}
+	<IconComponent class={className} />
+{/snippet}
diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte b/tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte
new file mode 100644
index 0000000000..3bd68d3bd6
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte
@@ -0,0 +1,50 @@
+<script lang="ts">
+	import type { Snippet } from 'svelte';
+	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
+	import { SearchInput } from '$lib/components/app';
+
+	interface Props {
+		placeholder?: string;
+		searchValue?: string;
+		onSearchChange?: (value: string) => void;
+		onSearchKeyDown?: (event: KeyboardEvent) => void;
+		emptyMessage?: string;
+		isEmpty?: boolean;
+		children: Snippet;
+		footer?: Snippet;
+	}
+
+	let {
+		placeholder = 'Search...',
+		searchValue = $bindable(''),
+		onSearchChange,
+		onSearchKeyDown,
+		emptyMessage = 'No items found',
+		isEmpty = false,
+		children,
+		footer
+	}: Props = $props();
+</script>
+
+<div class="sticky top-0 z-10 mb-2 bg-popover p-1 pt-2">
+	<SearchInput
+		{placeholder}
+		bind:value={searchValue}
+		onInput={onSearchChange}
+		onKeyDown={onSearchKeyDown}
+	/>
+</div>
+
+<div class="overflow-y-auto">
+	{@render children()}
+
+	{#if isEmpty}
+		<div class="px-2 py-3 text-center text-sm text-muted-foreground">{emptyMessage}</div>
+	{/if}
+</div>
+
+{#if footer}
+	<DropdownMenu.Separator />
+
+	{@render footer()}
+{/if}
diff --git a/tools/server/webui/src/lib/components/app/navigation/index.ts b/tools/server/webui/src/lib/components/app/navigation/index.ts
new file mode 100644
index 0000000000..051491b866
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/navigation/index.ts
@@ -0,0 +1,65 @@
+/**
+ *
+ * NAVIGATION & MENUS
+ *
+ * Components for dropdown menus and action selection.
+ *
+ */
+
+/**
+ * **DropdownMenuSearchable** - Searchable content for dropdown menus
+ *
+ * Renders a search input with filtered content area, empty state, and optional footer.
+ * Designed to be injected into any dropdown container (DropdownMenu.Content,
+ * DropdownMenu.SubContent, etc.) without providing its own Root.
+ *
+ * **Features:**
+ * - Search/filter input
+ * - Keyboard navigation support
+ * - Custom content and footer via snippets
+ * - Empty state message
+ *
+ * @example
+ * ```svelte
+ * <DropdownMenu.Root>
+ *   <DropdownMenu.Trigger>...</DropdownMenu.Trigger>
+ *   <DropdownMenu.Content class="pt-0">
+ *     <DropdownMenuSearchable
+ *       bind:searchValue
+ *       placeholder="Search..."
+ *       isEmpty={filteredItems.length === 0}
+ *     >
+ *       {#each items as item}<Item {item} />{/each}
+ *     </DropdownMenuSearchable>
+ *   </DropdownMenu.Content>
+ * </DropdownMenu.Root>
+ * ```
+ */
+export { default as DropdownMenuSearchable } from './DropdownMenuSearchable.svelte';
+
+/**
+ * **DropdownMenuActions** - Multi-action dropdown menu
+ *
+ * Dropdown menu for multiple action options with icons and shortcuts.
+ * Supports destructive variants and keyboard shortcut hints.
+ *
+ * **Features:**
+ * - Configurable trigger icon with tooltip
+ * - Action items with icons and labels
+ * - Destructive variant styling
+ * - Keyboard shortcut display
+ * - Separator support between groups
+ *
+ * @example
+ * ```svelte
+ * <DropdownMenuActions
+ *   triggerIcon={MoreHorizontal}
+ *   triggerTooltip="More actions"
+ *   actions={[
+ *     { icon: Edit, label: 'Edit', onclick: handleEdit },
+ *     { icon: Trash, label: 'Delete', onclick: handleDelete, variant: 'destructive' }
+ *   ]}
+ * />
+ * ```
+ */
+export { default as DropdownMenuActions } from './DropdownMenuActions.svelte';
diff --git a/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte b/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte
index fa4c2842cc..520e5bf56f 100644
--- a/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte
+++ b/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte
@@ -8,6 +8,7 @@
 	import { serverStore, serverLoading } from '$lib/stores/server.svelte';
 	import { config, settingsStore } from '$lib/stores/settings.svelte';
 	import { fade, fly, scale } from 'svelte/transition';
+	import { KeyboardKey } from '$lib/enums/keyboard';
 
 	interface Props {
 		class?: string;
@@ -117,7 +118,7 @@
 	}
 
 	function handleApiKeyKeydown(event: KeyboardEvent) {
-		if (event.key === 'Enter') {
+		if (event.key === KeyboardKey.ENTER) {
 			handleSaveApiKey();
 		}
 	}
diff --git a/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte b/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte
index d9f6d4a32a..86a962de12 100644
--- a/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte
+++ b/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte
@@ -48,7 +48,7 @@
 			{model || 'Unknown Model'}
 		</Badge>
 
-		{#if serverData.default_generation_settings.n_ctx}
+		{#if serverData?.default_generation_settings?.n_ctx}
 			<Badge variant="secondary" class="text-xs">
 				ctx: {serverData.default_generation_settings.n_ctx.toLocaleString()}
 			</Badge>
diff --git a/tools/server/webui/src/lib/components/app/server/index.ts b/tools/server/webui/src/lib/components/app/server/index.ts
new file mode 100644
index 0000000000..39ac5b482d
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/server/index.ts
@@ -0,0 +1,80 @@
+/**
+ *
+ * SERVER
+ *
+ * Components for displaying server connection state and handling
+ * connection errors. Integrates with serverStore for state management.
+ *
+ */
+
+/**
+ * **ServerStatus** - Server connection status indicator
+ *
+ * Compact status display showing connection state, model name,
+ * and context size. Used in headers and loading screens.
+ *
+ * **Architecture:**
+ * - Reads state from serverStore (props, loading, error)
+ * - Displays model name from modelsStore
+ *
+ * **Features:**
+ * - Status dot: green (connected), yellow (connecting), red (error), gray (unknown)
+ * - Status text label
+ * - Model name badge with icon
+ * - Context size badge
+ * - Optional error action button
+ *
+ * @example
+ * ```svelte
+ * <ServerStatus showActions />
+ * ```
+ */
+export { default as ServerStatus } from './ServerStatus.svelte';
+
+/**
+ * **ServerErrorSplash** - Full-screen connection error display
+ *
+ * Blocking error screen shown when server connection fails.
+ * Provides retry options and API key input for authentication errors.
+ *
+ * **Architecture:**
+ * - Detects access denied errors for API key flow
+ * - Validates API key against server before saving
+ * - Integrates with settingsStore for API key persistence
+ *
+ * **Features:**
+ * - Error message display with icon
+ * - Retry connection button with loading state
+ * - API key input for authentication errors
+ * - API key validation with success/error feedback
+ * - Troubleshooting section with server start commands
+ * - Animated transitions for UI elements
+ *
+ * @example
+ * ```svelte
+ * <ServerErrorSplash
+ *   error={serverError}
+ *   onRetry={handleRetry}
+ *   showTroubleshooting
+ * />
+ * ```
+ */
+export { default as ServerErrorSplash } from './ServerErrorSplash.svelte';
+
+/**
+ * **ServerLoadingSplash** - Full-screen loading display
+ *
+ * Shown during initial server connection. Displays loading animation
+ * with ServerStatus component for real-time connection state.
+ *
+ * **Features:**
+ * - Animated server icon
+ * - Customizable loading message
+ * - Embedded ServerStatus for live updates
+ *
+ * @example
+ * ```svelte
+ * <ServerLoadingSplash message="Connecting to server..." />
+ * ```
+ */
+export { default as ServerLoadingSplash } from './ServerLoadingSplash.svelte';
diff --git a/tools/server/webui/src/lib/components/ui/badge/badge.svelte b/tools/server/webui/src/lib/components/ui/badge/badge.svelte
index 4d15145493..c3e6ac0720 100644
--- a/tools/server/webui/src/lib/components/ui/badge/badge.svelte
+++ b/tools/server/webui/src/lib/components/ui/badge/badge.svelte
@@ -42,7 +42,7 @@
 	bind:this={ref}
 	data-slot="badge"
 	{href}
-	class={cn(badgeVariants({ variant }), className)}
+	class={cn(badgeVariants({ variant }), className, 'backdrop-blur-sm')}
 	{...restProps}
 >
 	{@render children?.()}
diff --git a/tools/server/webui/src/lib/components/ui/button/button.svelte b/tools/server/webui/src/lib/components/ui/button/button.svelte
index d12c8de147..d29358c8e0 100644
--- a/tools/server/webui/src/lib/components/ui/button/button.svelte
+++ b/tools/server/webui/src/lib/components/ui/button/button.svelte
@@ -12,8 +12,9 @@
 					'bg-destructive shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60 text-white',
 				outline:
 					'bg-background shadow-xs hover:bg-accent hover:text-accent-foreground dark:bg-input/30 dark:border-input dark:hover:bg-input/50 border',
-				secondary: 'bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80',
-				ghost: 'hover:bg-accent hover:text-accent-foreground dark:hover:bg-accent/50',
+				secondary:
+					'dark:bg-secondary dark:text-secondary-foreground bg-background shadow-sm text-foreground hover:bg-muted-foreground/20',
+				ghost: 'hover:text-accent-foreground hover:bg-muted-foreground/10',
 				link: 'text-primary underline-offset-4 hover:underline'
 			},
 			size: {
diff --git a/tools/server/webui/src/lib/components/ui/card/card.svelte b/tools/server/webui/src/lib/components/ui/card/card.svelte
index c40d14309f..b9dcd2de6f 100644
--- a/tools/server/webui/src/lib/components/ui/card/card.svelte
+++ b/tools/server/webui/src/lib/components/ui/card/card.svelte
@@ -1,6 +1,7 @@
 <script lang="ts">
 	import type { HTMLAttributes } from 'svelte/elements';
 	import { cn, type WithElementRef } from '$lib/components/ui/utils';
+	import { BOX_BORDER } from '$lib/constants/css-classes';
 
 	let {
 		ref = $bindable(null),
@@ -14,7 +15,8 @@
 	bind:this={ref}
 	data-slot="card"
 	class={cn(
-		'flex flex-col gap-6 rounded-xl border bg-card py-6 text-card-foreground shadow-sm',
+		'flex flex-col gap-6 rounded-xl bg-card py-6 text-card-foreground shadow-sm',
+		BOX_BORDER,
 		className
 	)}
 	{...restProps}
diff --git a/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte b/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte
index 869c38e848..6013ca2661 100644
--- a/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte
+++ b/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte
@@ -19,7 +19,7 @@
 		data-slot="dropdown-menu-content"
 		{sideOffset}
 		class={cn(
-			'z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1 text-popover-foreground shadow-md outline-none data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20',
+			'z-50 max-h-(--bits-dropdown-menu-content-available-height) min-w-[8rem] origin-(--bits-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border border-border bg-popover p-1.5 text-popover-foreground shadow-md outline-none data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 dark:border-border/20',
 			className
 		)}
 		{...restProps}
diff --git a/tools/server/webui/src/lib/components/ui/input/input.svelte b/tools/server/webui/src/lib/components/ui/input/input.svelte
index 889b720716..2b6279b642 100644
--- a/tools/server/webui/src/lib/components/ui/input/input.svelte
+++ b/tools/server/webui/src/lib/components/ui/input/input.svelte
@@ -44,6 +44,7 @@
 			'aria-invalid:border-destructive aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40',
 			className
 		)}
+		style="backdrop-filter: blur(0.5rem);"
 		{type}
 		bind:value
 		{...restProps}
diff --git a/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte b/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte
index 29d3a9c43a..0d5baf6d6d 100644
--- a/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte
+++ b/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte
@@ -1,6 +1,5 @@
 <script lang="ts">
 	import { Button } from '$lib/components/ui/button/index.js';
-	import { cn } from '$lib/components/ui/utils.js';
 	import PanelLeftIcon from '@lucide/svelte/icons/panel-left';
 	import type { ComponentProps } from 'svelte';
 	import { useSidebar } from './context.svelte.js';
@@ -22,7 +21,7 @@
 	data-slot="sidebar-trigger"
 	variant="ghost"
 	size="icon"
-	class={cn('size-7', className)}
+	class="rounded-full backdrop-blur-lg {className} h-9! w-9!"
 	type="button"
 	onclick={(e) => {
 		onclick?.(e);
diff --git a/tools/server/webui/src/lib/components/ui/switch/switch.svelte b/tools/server/webui/src/lib/components/ui/switch/switch.svelte
index 5a5975e137..e0848790d3 100644
--- a/tools/server/webui/src/lib/components/ui/switch/switch.svelte
+++ b/tools/server/webui/src/lib/components/ui/switch/switch.svelte
@@ -15,7 +15,7 @@
 	bind:checked
 	data-slot="switch"
 	class={cn(
-		'peer inline-flex h-[1.15rem] w-8 shrink-0 items-center rounded-full border border-transparent shadow-xs transition-all outline-none focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input dark:data-[state=unchecked]:bg-input/80',
+		'peer inline-flex h-[1.15rem] w-8 shrink-0 cursor-pointer items-center rounded-full border border-transparent shadow-xs transition-all outline-none focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input dark:data-[state=unchecked]:bg-input/80',
 		className
 	)}
 	{...restProps}
diff --git a/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte b/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte
index 72ea93a010..011d492f3d 100644
--- a/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte
+++ b/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte
@@ -9,22 +9,28 @@
 		side = 'top',
 		children,
 		arrowClasses,
+		noPortal = false,
 		...restProps
 	}: TooltipPrimitive.ContentProps & {
 		arrowClasses?: string;
+		noPortal?: boolean;
 	} = $props();
+
+	const contentClass = $derived(
+		cn(
+			'z-50 w-fit origin-(--bits-tooltip-content-transform-origin) animate-in rounded-md bg-primary px-3 py-1.5 text-xs text-balance text-primary-foreground fade-in-0 zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95',
+			className
+		)
+	);
 </script>
 
-<TooltipPrimitive.Portal>
+{#snippet tooltipContent()}
 	<TooltipPrimitive.Content
 		bind:ref
 		data-slot="tooltip-content"
 		{sideOffset}
 		{side}
-		class={cn(
-			'z-50 w-fit origin-(--bits-tooltip-content-transform-origin) animate-in rounded-md bg-primary px-3 py-1.5 text-xs text-balance text-primary-foreground fade-in-0 zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95',
-			className
-		)}
+		class={contentClass}
 		{...restProps}
 	>
 		{@render children?.()}
@@ -44,4 +50,12 @@
 			{/snippet}
 		</TooltipPrimitive.Arrow>
 	</TooltipPrimitive.Content>
-</TooltipPrimitive.Portal>
+{/snippet}
+
+{#if noPortal}
+	{@render tooltipContent()}
+{:else}
+	<TooltipPrimitive.Portal>
+		{@render tooltipContent()}
+	</TooltipPrimitive.Portal>
+{/if}
diff --git a/tools/server/webui/src/lib/constants/binary-detection.ts b/tools/server/webui/src/lib/constants/binary-detection.ts
index a4440fde5d..eac919ad96 100644
--- a/tools/server/webui/src/lib/constants/binary-detection.ts
+++ b/tools/server/webui/src/lib/constants/binary-detection.ts
@@ -1,9 +1,6 @@
 export interface BinaryDetectionOptions {
-	/** Number of characters to check from the beginning of the file */
 	prefixLength: number;
-	/** Maximum ratio of suspicious characters allowed (0.0 to 1.0) */
 	suspiciousCharThresholdRatio: number;
-	/** Maximum absolute number of null bytes allowed */
 	maxAbsoluteNullBytes: number;
 }
 
diff --git a/tools/server/webui/src/lib/constants/chat-form.ts b/tools/server/webui/src/lib/constants/chat-form.ts
new file mode 100644
index 0000000000..c5e3dc3d1b
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/chat-form.ts
@@ -0,0 +1,3 @@
+export const INITIAL_FILE_SIZE = 0;
+export const PROMPT_CONTENT_SEPARATOR = '\n\n';
+export const CLIPBOARD_CONTENT_QUOTE_PREFIX = '"';
diff --git a/tools/server/webui/src/lib/constants/code-blocks.ts b/tools/server/webui/src/lib/constants/code-blocks.ts
new file mode 100644
index 0000000000..0f7265104d
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/code-blocks.ts
@@ -0,0 +1,8 @@
+export const CODE_BLOCK_SCROLL_CONTAINER_CLASS = 'code-block-scroll-container';
+export const CODE_BLOCK_WRAPPER_CLASS = 'code-block-wrapper';
+export const CODE_BLOCK_HEADER_CLASS = 'code-block-header';
+export const CODE_BLOCK_ACTIONS_CLASS = 'code-block-actions';
+export const CODE_LANGUAGE_CLASS = 'code-language';
+export const COPY_CODE_BTN_CLASS = 'copy-code-btn';
+export const PREVIEW_CODE_BTN_CLASS = 'preview-code-btn';
+export const RELATIVE_CLASS = 'relative';
diff --git a/tools/server/webui/src/lib/constants/code.ts b/tools/server/webui/src/lib/constants/code.ts
new file mode 100644
index 0000000000..12bcd0db77
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/code.ts
@@ -0,0 +1,7 @@
+export const NEWLINE = '\n';
+export const DEFAULT_LANGUAGE = 'text';
+export const LANG_PATTERN = /^(\w*)\n?/;
+export const AMPERSAND_REGEX = /&/g;
+export const LT_REGEX = /</g;
+export const GT_REGEX = />/g;
+export const FENCE_PATTERN = /^```|\n```/g;
diff --git a/tools/server/webui/src/lib/constants/css-classes.ts b/tools/server/webui/src/lib/constants/css-classes.ts
new file mode 100644
index 0000000000..46076e55f6
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/css-classes.ts
@@ -0,0 +1,10 @@
+export const BOX_BORDER =
+	'border border-border/30 focus-within:border-border  dark:border-border/20 dark:focus-within:border-border';
+
+export const INPUT_CLASSES = `
+    bg-muted/60 dark:bg-muted/75
+    ${BOX_BORDER}
+    shadow-sm
+    outline-none
+    text-foreground
+`;
diff --git a/tools/server/webui/src/lib/constants/formatters.ts b/tools/server/webui/src/lib/constants/formatters.ts
new file mode 100644
index 0000000000..d6d1b883ff
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/formatters.ts
@@ -0,0 +1,8 @@
+export const MS_PER_SECOND = 1000;
+export const SECONDS_PER_MINUTE = 60;
+export const SECONDS_PER_HOUR = 3600;
+export const SHORT_DURATION_THRESHOLD = 1;
+export const MEDIUM_DURATION_THRESHOLD = 10;
+
+/** Default display value when no performance time is available */
+export const DEFAULT_PERFORMANCE_TIME = '0s';
diff --git a/tools/server/webui/src/lib/constants/markdown.ts b/tools/server/webui/src/lib/constants/markdown.ts
new file mode 100644
index 0000000000..783d31a22c
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/markdown.ts
@@ -0,0 +1,4 @@
+export const IMAGE_NOT_ERROR_BOUND_SELECTOR = 'img:not([data-error-bound])';
+export const DATA_ERROR_BOUND_ATTR = 'errorBound';
+export const DATA_ERROR_HANDLED_ATTR = 'errorHandled';
+export const BOOL_TRUE_STRING = 'true';
diff --git a/tools/server/webui/src/lib/constants/processing-info.ts b/tools/server/webui/src/lib/constants/processing-info.ts
index 726439211b..2c3f7dc534 100644
--- a/tools/server/webui/src/lib/constants/processing-info.ts
+++ b/tools/server/webui/src/lib/constants/processing-info.ts
@@ -1 +1,8 @@
 export const PROCESSING_INFO_TIMEOUT = 2000;
+
+/**
+ * Statistics units labels
+ */
+export const STATS_UNITS = {
+	TOKENS_PER_SECOND: 't/s'
+} as const;
diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts
index cac48a557c..1b959f3b69 100644
--- a/tools/server/webui/src/lib/constants/settings-config.ts
+++ b/tools/server/webui/src/lib/constants/settings-config.ts
@@ -7,7 +7,8 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
 	theme: 'system',
 	showThoughtInProgress: false,
 	showToolCalls: false,
-	disableReasoningFormat: false,
+	disableReasoningParsing: false,
+	showRawOutputSwitch: false,
 	keepStatsVisible: false,
 	showMessageStats: true,
 	askForTitleConfirmation: false,
@@ -92,8 +93,10 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 	showThoughtInProgress: 'Expand thought process by default when generating messages.',
 	showToolCalls:
 		'Display tool call labels and payloads from Harmony-compatible delta.tool_calls data below assistant messages.',
-	disableReasoningFormat:
-		'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
+	disableReasoningParsing:
+		'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
+	showRawOutputSwitch:
+		'Show toggle button to display messages as plain text instead of Markdown-formatted content',
 	keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
 	showMessageStats:
 		'Display generation statistics (tokens/second, token count, duration) below each assistant message.',
diff --git a/tools/server/webui/src/lib/constants/settings-fields.ts b/tools/server/webui/src/lib/constants/settings-fields.ts
new file mode 100644
index 0000000000..79a6e92870
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/settings-fields.ts
@@ -0,0 +1,33 @@
+/**
+ * List of all numeric fields in settings configuration.
+ * These fields will be converted from strings to numbers during save.
+ */
+export const NUMERIC_FIELDS = [
+	'temperature',
+	'top_k',
+	'top_p',
+	'min_p',
+	'max_tokens',
+	'pasteLongTextToFileLen',
+	'dynatemp_range',
+	'dynatemp_exponent',
+	'typ_p',
+	'xtc_probability',
+	'xtc_threshold',
+	'repeat_last_n',
+	'repeat_penalty',
+	'presence_penalty',
+	'frequency_penalty',
+	'dry_multiplier',
+	'dry_base',
+	'dry_allowed_length',
+	'dry_penalty_last_n',
+	'agenticMaxTurns',
+	'agenticMaxToolPreviewLines'
+] as const;
+
+/**
+ * Fields that must be positive integers (>= 1).
+ * These will be clamped to minimum 1 and rounded during save.
+ */
+export const POSITIVE_INTEGER_FIELDS = ['agenticMaxTurns', 'agenticMaxToolPreviewLines'] as const;
diff --git a/tools/server/webui/src/lib/constants/tooltip-config.ts b/tools/server/webui/src/lib/constants/tooltip-config.ts
index 3c30c8c072..ad76ab3522 100644
--- a/tools/server/webui/src/lib/constants/tooltip-config.ts
+++ b/tools/server/webui/src/lib/constants/tooltip-config.ts
@@ -1 +1 @@
-export const TOOLTIP_DELAY_DURATION = 100;
+export const TOOLTIP_DELAY_DURATION = 500;
diff --git a/tools/server/webui/src/lib/constants/ui.ts b/tools/server/webui/src/lib/constants/ui.ts
new file mode 100644
index 0000000000..a75b30f2f8
--- /dev/null
+++ b/tools/server/webui/src/lib/constants/ui.ts
@@ -0,0 +1 @@
+export const SYSTEM_MESSAGE_PLACEHOLDER = 'System message';
diff --git a/tools/server/webui/src/lib/contexts/chat-actions.context.ts b/tools/server/webui/src/lib/contexts/chat-actions.context.ts
new file mode 100644
index 0000000000..eba0fec027
--- /dev/null
+++ b/tools/server/webui/src/lib/contexts/chat-actions.context.ts
@@ -0,0 +1,34 @@
+import { getContext, setContext } from 'svelte';
+
+export interface ChatActionsContext {
+	copy: (message: DatabaseMessage) => void;
+	delete: (message: DatabaseMessage) => void;
+	navigateToSibling: (siblingId: string) => void;
+	editWithBranching: (
+		message: DatabaseMessage,
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[]
+	) => void;
+	editWithReplacement: (
+		message: DatabaseMessage,
+		newContent: string,
+		shouldBranch: boolean
+	) => void;
+	editUserMessagePreserveResponses: (
+		message: DatabaseMessage,
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[]
+	) => void;
+	regenerateWithBranching: (message: DatabaseMessage, modelOverride?: string) => void;
+	continueAssistantMessage: (message: DatabaseMessage) => void;
+}
+
+const CHAT_ACTIONS_KEY = Symbol.for('chat-actions');
+
+export function setChatActionsContext(ctx: ChatActionsContext): ChatActionsContext {
+	return setContext(CHAT_ACTIONS_KEY, ctx);
+}
+
+export function getChatActionsContext(): ChatActionsContext {
+	return getContext(CHAT_ACTIONS_KEY);
+}
diff --git a/tools/server/webui/src/lib/contexts/index.ts b/tools/server/webui/src/lib/contexts/index.ts
new file mode 100644
index 0000000000..73ff6f96fa
--- /dev/null
+++ b/tools/server/webui/src/lib/contexts/index.ts
@@ -0,0 +1,13 @@
+export {
+	getMessageEditContext,
+	setMessageEditContext,
+	type MessageEditContext,
+	type MessageEditState,
+	type MessageEditActions
+} from './message-edit.context';
+
+export {
+	getChatActionsContext,
+	setChatActionsContext,
+	type ChatActionsContext
+} from './chat-actions.context';
diff --git a/tools/server/webui/src/lib/contexts/message-edit.context.ts b/tools/server/webui/src/lib/contexts/message-edit.context.ts
new file mode 100644
index 0000000000..7af116daa5
--- /dev/null
+++ b/tools/server/webui/src/lib/contexts/message-edit.context.ts
@@ -0,0 +1,39 @@
+import { getContext, setContext } from 'svelte';
+
+export interface MessageEditState {
+	readonly isEditing: boolean;
+	readonly editedContent: string;
+	readonly editedExtras: DatabaseMessageExtra[];
+	readonly editedUploadedFiles: ChatUploadedFile[];
+	readonly originalContent: string;
+	readonly originalExtras: DatabaseMessageExtra[];
+	readonly showSaveOnlyOption: boolean;
+}
+
+export interface MessageEditActions {
+	setContent: (content: string) => void;
+	setExtras: (extras: DatabaseMessageExtra[]) => void;
+	setUploadedFiles: (files: ChatUploadedFile[]) => void;
+	save: () => void;
+	saveOnly: () => void;
+	cancel: () => void;
+	startEdit: () => void;
+}
+
+export type MessageEditContext = MessageEditState & MessageEditActions;
+
+const MESSAGE_EDIT_KEY = Symbol.for('chat-message-edit');
+
+/**
+ * Sets the message edit context. Call this in the parent component (ChatMessage.svelte).
+ */
+export function setMessageEditContext(ctx: MessageEditContext): MessageEditContext {
+	return setContext(MESSAGE_EDIT_KEY, ctx);
+}
+
+/**
+ * Gets the message edit context. Call this in child components.
+ */
+export function getMessageEditContext(): MessageEditContext {
+	return getContext(MESSAGE_EDIT_KEY);
+}
diff --git a/tools/server/webui/src/lib/enums/chat.ts b/tools/server/webui/src/lib/enums/chat.ts
index 2b9eb7bc2e..0b6f357d9a 100644
--- a/tools/server/webui/src/lib/enums/chat.ts
+++ b/tools/server/webui/src/lib/enums/chat.ts
@@ -1,4 +1,51 @@
 export enum ChatMessageStatsView {
 	GENERATION = 'generation',
-	READING = 'reading'
+	READING = 'reading',
+	TOOLS = 'tools',
+	SUMMARY = 'summary'
+}
+
+/**
+ * Reasoning format options for API requests.
+ */
+export enum ReasoningFormat {
+	NONE = 'none',
+	AUTO = 'auto'
+}
+
+/**
+ * Message roles for chat messages.
+ */
+export enum MessageRole {
+	USER = 'user',
+	ASSISTANT = 'assistant',
+	SYSTEM = 'system',
+	TOOL = 'tool'
+}
+
+/**
+ * Message types for different content kinds.
+ */
+export enum MessageType {
+	ROOT = 'root',
+	TEXT = 'text',
+	THINK = 'think',
+	SYSTEM = 'system'
+}
+
+/**
+ * Content part types for API chat message content.
+ */
+export enum ContentPartType {
+	TEXT = 'text',
+	IMAGE_URL = 'image_url',
+	INPUT_AUDIO = 'input_audio'
+}
+
+/**
+ * Error dialog types for displaying server/timeout errors.
+ */
+export enum ErrorDialogType {
+	TIMEOUT = 'timeout',
+	SERVER = 'server'
 }
diff --git a/tools/server/webui/src/lib/enums/keyboard.ts b/tools/server/webui/src/lib/enums/keyboard.ts
new file mode 100644
index 0000000000..b8f6d5f7a2
--- /dev/null
+++ b/tools/server/webui/src/lib/enums/keyboard.ts
@@ -0,0 +1,15 @@
+/**
+ * Keyboard key names for event handling
+ */
+export enum KeyboardKey {
+	ENTER = 'Enter',
+	ESCAPE = 'Escape',
+	ARROW_UP = 'ArrowUp',
+	ARROW_DOWN = 'ArrowDown',
+	TAB = 'Tab',
+	D_LOWER = 'd',
+	D_UPPER = 'D',
+	E_UPPER = 'E',
+	K_LOWER = 'k',
+	O_UPPER = 'O'
+}
diff --git a/tools/server/webui/src/lib/enums/settings.ts b/tools/server/webui/src/lib/enums/settings.ts
new file mode 100644
index 0000000000..f17f219762
--- /dev/null
+++ b/tools/server/webui/src/lib/enums/settings.ts
@@ -0,0 +1,26 @@
+/**
+ * Parameter source - indicates whether a parameter uses default or custom value
+ */
+export enum ParameterSource {
+	DEFAULT = 'default',
+	CUSTOM = 'custom'
+}
+
+/**
+ * Syncable parameter type - data types for parameters that can be synced with server
+ */
+export enum SyncableParameterType {
+	NUMBER = 'number',
+	STRING = 'string',
+	BOOLEAN = 'boolean'
+}
+
+/**
+ * Settings field type - defines the input type for settings fields
+ */
+export enum SettingsFieldType {
+	INPUT = 'input',
+	TEXTAREA = 'textarea',
+	CHECKBOX = 'checkbox',
+	SELECT = 'select'
+}
diff --git a/tools/server/webui/src/lib/hooks/use-auto-scroll.svelte.ts b/tools/server/webui/src/lib/hooks/use-auto-scroll.svelte.ts
new file mode 100644
index 0000000000..bbaa5d1362
--- /dev/null
+++ b/tools/server/webui/src/lib/hooks/use-auto-scroll.svelte.ts
@@ -0,0 +1,165 @@
+import { AUTO_SCROLL_AT_BOTTOM_THRESHOLD, AUTO_SCROLL_INTERVAL } from '$lib/constants/auto-scroll';
+
+export interface AutoScrollOptions {
+	/** Whether auto-scroll is disabled globally (e.g., from settings) */
+	disabled?: boolean;
+}
+
+/**
+ * Creates an auto-scroll controller for a scrollable container.
+ *
+ * Features:
+ * - Auto-scrolls to bottom during streaming/loading
+ * - Stops auto-scroll when user manually scrolls up
+ * - Resumes auto-scroll when user scrolls back to bottom
+ */
+export class AutoScrollController {
+	private _autoScrollEnabled = $state(true);
+	private _userScrolledUp = $state(false);
+	private _lastScrollTop = $state(0);
+	private _scrollInterval: ReturnType<typeof setInterval> | undefined;
+	private _scrollTimeout: ReturnType<typeof setTimeout> | undefined;
+	private _container: HTMLElement | undefined;
+	private _disabled: boolean;
+
+	constructor(options: AutoScrollOptions = {}) {
+		this._disabled = options.disabled ?? false;
+	}
+
+	get autoScrollEnabled(): boolean {
+		return this._autoScrollEnabled;
+	}
+
+	get userScrolledUp(): boolean {
+		return this._userScrolledUp;
+	}
+
+	/**
+	 * Binds the controller to a scrollable container element.
+	 */
+	setContainer(container: HTMLElement | undefined): void {
+		this._container = container;
+	}
+
+	/**
+	 * Updates the disabled state.
+	 */
+	setDisabled(disabled: boolean): void {
+		this._disabled = disabled;
+		if (disabled) {
+			this._autoScrollEnabled = false;
+			this.stopInterval();
+		}
+	}
+
+	/**
+	 * Handles scroll events to detect user scroll direction and toggle auto-scroll.
+	 */
+	handleScroll(): void {
+		if (this._disabled || !this._container) return;
+
+		const { scrollTop, scrollHeight, clientHeight } = this._container;
+		const distanceFromBottom = scrollHeight - scrollTop - clientHeight;
+		const isAtBottom = distanceFromBottom < AUTO_SCROLL_AT_BOTTOM_THRESHOLD;
+
+		if (scrollTop < this._lastScrollTop && !isAtBottom) {
+			this._userScrolledUp = true;
+			this._autoScrollEnabled = false;
+		} else if (isAtBottom && this._userScrolledUp) {
+			this._userScrolledUp = false;
+			this._autoScrollEnabled = true;
+		}
+
+		if (this._scrollTimeout) {
+			clearTimeout(this._scrollTimeout);
+		}
+
+		this._scrollTimeout = setTimeout(() => {
+			if (isAtBottom) {
+				this._userScrolledUp = false;
+				this._autoScrollEnabled = true;
+			}
+		}, AUTO_SCROLL_INTERVAL);
+
+		this._lastScrollTop = scrollTop;
+	}
+
+	/**
+	 * Scrolls the container to the bottom.
+	 */
+	scrollToBottom(behavior: ScrollBehavior = 'smooth'): void {
+		if (this._disabled || !this._container) return;
+
+		this._container.scrollTo({
+			top: this._container.scrollHeight,
+			behavior
+		});
+	}
+
+	/**
+	 * Enables auto-scroll (e.g., when user sends a message).
+	 */
+	enable(): void {
+		if (this._disabled) return;
+		this._userScrolledUp = false;
+		this._autoScrollEnabled = true;
+	}
+
+	/**
+	 * Starts the auto-scroll interval for continuous scrolling during streaming.
+	 */
+	startInterval(): void {
+		if (this._disabled || this._scrollInterval) return;
+
+		this._scrollInterval = setInterval(() => {
+			this.scrollToBottom();
+		}, AUTO_SCROLL_INTERVAL);
+	}
+
+	/**
+	 * Stops the auto-scroll interval.
+	 */
+	stopInterval(): void {
+		if (this._scrollInterval) {
+			clearInterval(this._scrollInterval);
+			this._scrollInterval = undefined;
+		}
+	}
+
+	/**
+	 * Updates the auto-scroll interval based on streaming state.
+	 * Call this in a $effect to automatically manage the interval.
+	 */
+	updateInterval(isStreaming: boolean): void {
+		if (this._disabled) {
+			this.stopInterval();
+			return;
+		}
+
+		if (isStreaming && this._autoScrollEnabled) {
+			if (!this._scrollInterval) {
+				this.startInterval();
+			}
+		} else {
+			this.stopInterval();
+		}
+	}
+
+	/**
+	 * Cleans up resources. Call this in onDestroy or when the component unmounts.
+	 */
+	destroy(): void {
+		this.stopInterval();
+		if (this._scrollTimeout) {
+			clearTimeout(this._scrollTimeout);
+			this._scrollTimeout = undefined;
+		}
+	}
+}
+
+/**
+ * Creates a new AutoScrollController instance.
+ */
+export function createAutoScrollController(options: AutoScrollOptions = {}): AutoScrollController {
+	return new AutoScrollController(options);
+}
diff --git a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
index c06cf28864..068440cdc0 100644
--- a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
+++ b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
@@ -1,7 +1,9 @@
 import { activeProcessingState } from '$lib/stores/chat.svelte';
 import { config } from '$lib/stores/settings.svelte';
+import { STATS_UNITS } from '$lib/constants/processing-info';
+import type { ApiProcessingState } from '$lib/types';
 
-export interface LiveProcessingStats {
+interface LiveProcessingStats {
 	tokensProcessed: number;
 	totalTokens: number;
 	timeMs: number;
@@ -9,7 +11,7 @@ export interface LiveProcessingStats {
 	etaSecs?: number;
 }
 
-export interface LiveGenerationStats {
+interface LiveGenerationStats {
 	tokensGenerated: number;
 	timeMs: number;
 	tokensPerSecond: number;
@@ -18,6 +20,7 @@ export interface LiveGenerationStats {
 export interface UseProcessingStateReturn {
 	readonly processingState: ApiProcessingState | null;
 	getProcessingDetails(): string[];
+	getTechnicalDetails(): string[];
 	getProcessingMessage(): string;
 	getPromptProgressText(): string | null;
 	getLiveProcessingStats(): LiveProcessingStats | null;
@@ -138,8 +141,31 @@ export function useProcessingState(): UseProcessingStateReturn {
 
 		const details: string[] = [];
 
+		// Show prompt processing progress with ETA during preparation phase
+		if (stateToUse.promptProgress) {
+			const { processed, total, time_ms, cache } = stateToUse.promptProgress;
+			const actualProcessed = processed - cache;
+			const actualTotal = total - cache;
+
+			if (actualProcessed < actualTotal && actualProcessed > 0) {
+				const percent = Math.round((actualProcessed / actualTotal) * 100);
+				const eta = getETASecs(actualProcessed, actualTotal, time_ms);
+
+				if (eta !== undefined) {
+					const etaSecs = Math.ceil(eta);
+					details.push(`Processing ${percent}% (ETA: ${etaSecs}s)`);
+				} else {
+					details.push(`Processing ${percent}%`);
+				}
+			}
+		}
+
 		// Always show context info when we have valid data
-		if (stateToUse.contextUsed >= 0 && stateToUse.contextTotal > 0) {
+		if (
+			typeof stateToUse.contextTotal === 'number' &&
+			stateToUse.contextUsed >= 0 &&
+			stateToUse.contextTotal > 0
+		) {
 			const contextPercent = Math.round((stateToUse.contextUsed / stateToUse.contextTotal) * 100);
 
 			details.push(
@@ -163,7 +189,57 @@ export function useProcessingState(): UseProcessingStateReturn {
 		}
 
 		if (stateToUse.tokensPerSecond && stateToUse.tokensPerSecond > 0) {
-			details.push(`${stateToUse.tokensPerSecond.toFixed(1)} tokens/sec`);
+			details.push(`${stateToUse.tokensPerSecond.toFixed(1)} ${STATS_UNITS.TOKENS_PER_SECOND}`);
+		}
+
+		if (stateToUse.speculative) {
+			details.push('Speculative decoding enabled');
+		}
+
+		return details;
+	}
+
+	/**
+	 * Returns technical details without the progress message (for bottom bar)
+	 */
+	function getTechnicalDetails(): string[] {
+		const stateToUse = processingState || lastKnownState;
+		if (!stateToUse) {
+			return [];
+		}
+
+		const details: string[] = [];
+
+		// Always show context info when we have valid data
+		if (
+			typeof stateToUse.contextTotal === 'number' &&
+			stateToUse.contextUsed >= 0 &&
+			stateToUse.contextTotal > 0
+		) {
+			const contextPercent = Math.round((stateToUse.contextUsed / stateToUse.contextTotal) * 100);
+
+			details.push(
+				`Context: ${stateToUse.contextUsed}/${stateToUse.contextTotal} (${contextPercent}%)`
+			);
+		}
+
+		if (stateToUse.outputTokensUsed > 0) {
+			// Handle infinite max_tokens (-1) case
+			if (stateToUse.outputTokensMax <= 0) {
+				details.push(`Output: ${stateToUse.outputTokensUsed}/∞`);
+			} else {
+				const outputPercent = Math.round(
+					(stateToUse.outputTokensUsed / stateToUse.outputTokensMax) * 100
+				);
+
+				details.push(
+					`Output: ${stateToUse.outputTokensUsed}/${stateToUse.outputTokensMax} (${outputPercent}%)`
+				);
+			}
+		}
+
+		if (stateToUse.tokensPerSecond && stateToUse.tokensPerSecond > 0) {
+			details.push(`${stateToUse.tokensPerSecond.toFixed(1)} ${STATS_UNITS.TOKENS_PER_SECOND}`);
 		}
 
 		if (stateToUse.speculative) {
@@ -251,6 +327,7 @@ export function useProcessingState(): UseProcessingStateReturn {
 			return processingState;
 		},
 		getProcessingDetails,
+		getTechnicalDetails,
 		getProcessingMessage,
 		getPromptProgressText,
 		getLiveProcessingStats,
diff --git a/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts b/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts
index 6f0e03e211..168de97403 100644
--- a/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts
+++ b/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts
@@ -13,6 +13,16 @@
 import type { Plugin } from 'unified';
 import type { Root, Element, ElementContent } from 'hast';
 import { visit } from 'unist-util-visit';
+import {
+	CODE_BLOCK_SCROLL_CONTAINER_CLASS,
+	CODE_BLOCK_WRAPPER_CLASS,
+	CODE_BLOCK_HEADER_CLASS,
+	CODE_BLOCK_ACTIONS_CLASS,
+	CODE_LANGUAGE_CLASS,
+	COPY_CODE_BTN_CLASS,
+	PREVIEW_CODE_BTN_CLASS,
+	RELATIVE_CLASS
+} from '$lib/constants/code-blocks';
 
 declare global {
 	interface Window {
@@ -42,7 +52,7 @@ function createCopyButton(codeId: string): Element {
 		type: 'element',
 		tagName: 'button',
 		properties: {
-			className: ['copy-code-btn'],
+			className: [COPY_CODE_BTN_CLASS],
 			'data-code-id': codeId,
 			title: 'Copy code',
 			type: 'button'
@@ -56,7 +66,7 @@ function createPreviewButton(codeId: string): Element {
 		type: 'element',
 		tagName: 'button',
 		properties: {
-			className: ['preview-code-btn'],
+			className: [PREVIEW_CODE_BTN_CLASS],
 			'data-code-id': codeId,
 			title: 'Preview code',
 			type: 'button'
@@ -75,30 +85,39 @@ function createHeader(language: string, codeId: string): Element {
 	return {
 		type: 'element',
 		tagName: 'div',
-		properties: { className: ['code-block-header'] },
+		properties: { className: [CODE_BLOCK_HEADER_CLASS] },
 		children: [
 			{
 				type: 'element',
 				tagName: 'span',
-				properties: { className: ['code-language'] },
+				properties: { className: [CODE_LANGUAGE_CLASS] },
 				children: [{ type: 'text', value: language }]
 			},
 			{
 				type: 'element',
 				tagName: 'div',
-				properties: { className: ['code-block-actions'] },
+				properties: { className: [CODE_BLOCK_ACTIONS_CLASS] },
 				children: actions
 			}
 		]
 	};
 }
 
+function createScrollContainer(preElement: Element): Element {
+	return {
+		type: 'element',
+		tagName: 'div',
+		properties: { className: [CODE_BLOCK_SCROLL_CONTAINER_CLASS] },
+		children: [preElement]
+	};
+}
+
 function createWrapper(header: Element, preElement: Element): Element {
 	return {
 		type: 'element',
 		tagName: 'div',
-		properties: { className: ['code-block-wrapper'] },
-		children: [header, preElement]
+		properties: { className: [CODE_BLOCK_WRAPPER_CLASS, RELATIVE_CLASS] },
+		children: [header, createScrollContainer(preElement)]
 	};
 }
 
diff --git a/tools/server/webui/src/lib/services/chat.ts b/tools/server/webui/src/lib/services/chat.ts
index 02fc6381c0..55af0ce816 100644
--- a/tools/server/webui/src/lib/services/chat.ts
+++ b/tools/server/webui/src/lib/services/chat.ts
@@ -90,7 +90,7 @@ export class ChatService {
 			custom,
 			timings_per_token,
 			// Config options
-			disableReasoningFormat
+			disableReasoningParsing
 		} = options;
 
 		const normalizedMessages: ApiChatMessageData[] = messages
@@ -127,7 +127,7 @@ export class ChatService {
 			requestBody.model = options.model;
 		}
 
-		requestBody.reasoning_format = disableReasoningFormat ? 'none' : 'auto';
+		requestBody.reasoning_format = disableReasoningParsing ? 'none' : 'auto';
 
 		if (temperature !== undefined) requestBody.temperature = temperature;
 		if (max_tokens !== undefined) {
diff --git a/tools/server/webui/src/lib/services/database.service.ts b/tools/server/webui/src/lib/services/database.service.ts
new file mode 100644
index 0000000000..0d5a9c1b99
--- /dev/null
+++ b/tools/server/webui/src/lib/services/database.service.ts
@@ -0,0 +1,368 @@
+import Dexie, { type EntityTable } from 'dexie';
+import { findDescendantMessages } from '$lib/utils';
+
+class LlamacppDatabase extends Dexie {
+	conversations!: EntityTable<DatabaseConversation, string>;
+	messages!: EntityTable<DatabaseMessage, string>;
+
+	constructor() {
+		super('LlamacppWebui');
+
+		this.version(1).stores({
+			conversations: 'id, lastModified, currNode, name',
+			messages: 'id, convId, type, role, timestamp, parent, children'
+		});
+	}
+}
+
+const db = new LlamacppDatabase();
+import { v4 as uuid } from 'uuid';
+import { MessageRole } from '$lib/enums/chat';
+
+export class DatabaseService {
+	/**
+	 *
+	 *
+	 * Conversations
+	 *
+	 *
+	 */
+
+	/**
+	 * Creates a new conversation.
+	 *
+	 * @param name - Name of the conversation
+	 * @returns The created conversation
+	 */
+	static async createConversation(name: string): Promise<DatabaseConversation> {
+		const conversation: DatabaseConversation = {
+			id: uuid(),
+			name,
+			lastModified: Date.now(),
+			currNode: ''
+		};
+
+		await db.conversations.add(conversation);
+		return conversation;
+	}
+
+	/**
+	 *
+	 *
+	 * Messages
+	 *
+	 *
+	 */
+
+	/**
+	 * Creates a new message branch by adding a message and updating parent/child relationships.
+	 * Also updates the conversation's currNode to point to the new message.
+	 *
+	 * @param message - Message to add (without id)
+	 * @param parentId - Parent message ID to attach to
+	 * @returns The created message
+	 */
+	static async createMessageBranch(
+		message: Omit<DatabaseMessage, 'id'>,
+		parentId: string | null
+	): Promise<DatabaseMessage> {
+		return await db.transaction('rw', [db.conversations, db.messages], async () => {
+			// Handle null parent (root message case)
+			if (parentId !== null) {
+				const parentMessage = await db.messages.get(parentId);
+				if (!parentMessage) {
+					throw new Error(`Parent message ${parentId} not found`);
+				}
+			}
+
+			const newMessage: DatabaseMessage = {
+				...message,
+				id: uuid(),
+				parent: parentId,
+				toolCalls: message.toolCalls ?? '',
+				children: []
+			};
+
+			await db.messages.add(newMessage);
+
+			// Update parent's children array if parent exists
+			if (parentId !== null) {
+				const parentMessage = await db.messages.get(parentId);
+				if (parentMessage) {
+					await db.messages.update(parentId, {
+						children: [...parentMessage.children, newMessage.id]
+					});
+				}
+			}
+
+			await this.updateConversation(message.convId, {
+				currNode: newMessage.id
+			});
+
+			return newMessage;
+		});
+	}
+
+	/**
+	 * Creates a root message for a new conversation.
+	 * Root messages are not displayed but serve as the tree root for branching.
+	 *
+	 * @param convId - Conversation ID
+	 * @returns The created root message
+	 */
+	static async createRootMessage(convId: string): Promise<string> {
+		const rootMessage: DatabaseMessage = {
+			id: uuid(),
+			convId,
+			type: 'root',
+			timestamp: Date.now(),
+			role: MessageRole.SYSTEM,
+			content: '',
+			parent: null,
+			toolCalls: '',
+			children: []
+		};
+
+		await db.messages.add(rootMessage);
+		return rootMessage.id;
+	}
+
+	/**
+	 * Creates a system prompt message for a conversation.
+	 *
+	 * @param convId - Conversation ID
+	 * @param systemPrompt - The system prompt content (must be non-empty)
+	 * @param parentId - Parent message ID (typically the root message)
+	 * @returns The created system message
+	 * @throws Error if systemPrompt is empty
+	 */
+	static async createSystemMessage(
+		convId: string,
+		systemPrompt: string,
+		parentId: string
+	): Promise<DatabaseMessage> {
+		const trimmedPrompt = systemPrompt.trim();
+		if (!trimmedPrompt) {
+			throw new Error('Cannot create system message with empty content');
+		}
+
+		const systemMessage: DatabaseMessage = {
+			id: uuid(),
+			convId,
+			type: MessageRole.SYSTEM,
+			timestamp: Date.now(),
+			role: MessageRole.SYSTEM,
+			content: trimmedPrompt,
+			parent: parentId,
+			children: []
+		};
+
+		await db.messages.add(systemMessage);
+
+		const parentMessage = await db.messages.get(parentId);
+		if (parentMessage) {
+			await db.messages.update(parentId, {
+				children: [...parentMessage.children, systemMessage.id]
+			});
+		}
+
+		return systemMessage;
+	}
+
+	/**
+	 * Deletes a conversation and all its messages.
+	 *
+	 * @param id - Conversation ID
+	 */
+	static async deleteConversation(id: string): Promise<void> {
+		await db.transaction('rw', [db.conversations, db.messages], async () => {
+			await db.conversations.delete(id);
+			await db.messages.where('convId').equals(id).delete();
+		});
+	}
+
+	/**
+	 * Deletes a message and removes it from its parent's children array.
+	 *
+	 * @param messageId - ID of the message to delete
+	 */
+	static async deleteMessage(messageId: string): Promise<void> {
+		await db.transaction('rw', db.messages, async () => {
+			const message = await db.messages.get(messageId);
+			if (!message) return;
+
+			// Remove this message from its parent's children array
+			if (message.parent) {
+				const parent = await db.messages.get(message.parent);
+				if (parent) {
+					parent.children = parent.children.filter((childId: string) => childId !== messageId);
+					await db.messages.put(parent);
+				}
+			}
+
+			// Delete the message
+			await db.messages.delete(messageId);
+		});
+	}
+
+	/**
+	 * Deletes a message and all its descendant messages (cascading deletion).
+	 * This removes the entire branch starting from the specified message.
+	 *
+	 * @param conversationId - ID of the conversation containing the message
+	 * @param messageId - ID of the root message to delete (along with all descendants)
+	 * @returns Array of all deleted message IDs
+	 */
+	static async deleteMessageCascading(
+		conversationId: string,
+		messageId: string
+	): Promise<string[]> {
+		return await db.transaction('rw', db.messages, async () => {
+			// Get all messages in the conversation to find descendants
+			const allMessages = await db.messages.where('convId').equals(conversationId).toArray();
+
+			// Find all descendant messages
+			const descendants = findDescendantMessages(allMessages, messageId);
+			const allToDelete = [messageId, ...descendants];
+
+			// Get the message to delete for parent cleanup
+			const message = await db.messages.get(messageId);
+			if (message && message.parent) {
+				const parent = await db.messages.get(message.parent);
+				if (parent) {
+					parent.children = parent.children.filter((childId: string) => childId !== messageId);
+					await db.messages.put(parent);
+				}
+			}
+
+			// Delete all messages in the branch
+			await db.messages.bulkDelete(allToDelete);
+
+			return allToDelete;
+		});
+	}
+
+	/**
+	 * Gets all conversations, sorted by last modified time (newest first).
+	 *
+	 * @returns Array of conversations
+	 */
+	static async getAllConversations(): Promise<DatabaseConversation[]> {
+		return await db.conversations.orderBy('lastModified').reverse().toArray();
+	}
+
+	/**
+	 * Gets a conversation by ID.
+	 *
+	 * @param id - Conversation ID
+	 * @returns The conversation if found, otherwise undefined
+	 */
+	static async getConversation(id: string): Promise<DatabaseConversation | undefined> {
+		return await db.conversations.get(id);
+	}
+
+	/**
+	 * Gets all messages in a conversation, sorted by timestamp (oldest first).
+	 *
+	 * @param convId - Conversation ID
+	 * @returns Array of messages in the conversation
+	 */
+	static async getConversationMessages(convId: string): Promise<DatabaseMessage[]> {
+		return await db.messages.where('convId').equals(convId).sortBy('timestamp');
+	}
+
+	/**
+	 * Updates a conversation.
+	 *
+	 * @param id - Conversation ID
+	 * @param updates - Partial updates to apply
+	 * @returns Promise that resolves when the conversation is updated
+	 */
+	static async updateConversation(
+		id: string,
+		updates: Partial<Omit<DatabaseConversation, 'id'>>
+	): Promise<void> {
+		await db.conversations.update(id, {
+			...updates,
+			lastModified: Date.now()
+		});
+	}
+
+	/**
+	 *
+	 *
+	 * Navigation
+	 *
+	 *
+	 */
+
+	/**
+	 * Updates the conversation's current node (active branch).
+	 * This determines which conversation path is currently being viewed.
+	 *
+	 * @param convId - Conversation ID
+	 * @param nodeId - Message ID to set as current node
+	 */
+	static async updateCurrentNode(convId: string, nodeId: string): Promise<void> {
+		await this.updateConversation(convId, {
+			currNode: nodeId
+		});
+	}
+
+	/**
+	 * Updates a message.
+	 *
+	 * @param id - Message ID
+	 * @param updates - Partial updates to apply
+	 * @returns Promise that resolves when the message is updated
+	 */
+	static async updateMessage(
+		id: string,
+		updates: Partial<Omit<DatabaseMessage, 'id'>>
+	): Promise<void> {
+		await db.messages.update(id, updates);
+	}
+
+	/**
+	 *
+	 *
+	 * Import
+	 *
+	 *
+	 */
+
+	/**
+	 * Imports multiple conversations and their messages.
+	 * Skips conversations that already exist.
+	 *
+	 * @param data - Array of { conv, messages } objects
+	 */
+	static async importConversations(
+		data: { conv: DatabaseConversation; messages: DatabaseMessage[] }[]
+	): Promise<{ imported: number; skipped: number }> {
+		let importedCount = 0;
+		let skippedCount = 0;
+
+		return await db.transaction('rw', [db.conversations, db.messages], async () => {
+			for (const item of data) {
+				const { conv, messages } = item;
+
+				const existing = await db.conversations.get(conv.id);
+				if (existing) {
+					console.warn(`Conversation "${conv.name}" already exists, skipping...`);
+					skippedCount++;
+					continue;
+				}
+
+				await db.conversations.add(conv);
+				for (const msg of messages) {
+					await db.messages.put(msg);
+				}
+
+				importedCount++;
+			}
+
+			return { imported: importedCount, skipped: skippedCount };
+		});
+	}
+}
diff --git a/tools/server/webui/src/lib/services/models.service.ts b/tools/server/webui/src/lib/services/models.service.ts
new file mode 100644
index 0000000000..7357c3f400
--- /dev/null
+++ b/tools/server/webui/src/lib/services/models.service.ts
@@ -0,0 +1,99 @@
+import { ServerModelStatus } from '$lib/enums';
+import { apiFetch, apiPost } from '$lib/utils/api-fetch';
+
+export class ModelsService {
+	/**
+	 *
+	 *
+	 * Listing
+	 *
+	 *
+	 */
+
+	/**
+	 * Fetch list of models from OpenAI-compatible endpoint.
+	 * Works in both MODEL and ROUTER modes.
+	 *
+	 * @returns List of available models with basic metadata
+	 */
+	static async list(): Promise<ApiModelListResponse> {
+		return apiFetch<ApiModelListResponse>('/v1/models');
+	}
+
+	/**
+	 * Fetch list of all models with detailed metadata (ROUTER mode).
+	 * Returns models with load status, paths, and other metadata
+	 * beyond what the OpenAI-compatible endpoint provides.
+	 *
+	 * @returns List of models with detailed status and configuration info
+	 */
+	static async listRouter(): Promise<ApiRouterModelsListResponse> {
+		return apiFetch<ApiRouterModelsListResponse>('/v1/models');
+	}
+
+	/**
+	 *
+	 *
+	 * Load/Unload
+	 *
+	 *
+	 */
+
+	/**
+	 * Load a model (ROUTER mode only).
+	 * Sends POST request to `/models/load`. Note: the endpoint returns success
+	 * before loading completes — use polling to await actual load status.
+	 *
+	 * @param modelId - Model identifier to load
+	 * @param extraArgs - Optional additional arguments to pass to the model instance
+	 * @returns Load response from the server
+	 */
+	static async load(modelId: string, extraArgs?: string[]): Promise<ApiRouterModelsLoadResponse> {
+		const payload: { model: string; extra_args?: string[] } = { model: modelId };
+		if (extraArgs && extraArgs.length > 0) {
+			payload.extra_args = extraArgs;
+		}
+
+		return apiPost<ApiRouterModelsLoadResponse>('/models/load', payload);
+	}
+
+	/**
+	 * Unload a model (ROUTER mode only).
+	 * Sends POST request to `/models/unload`. Note: the endpoint returns success
+	 * before unloading completes — use polling to await actual unload status.
+	 *
+	 * @param modelId - Model identifier to unload
+	 * @returns Unload response from the server
+	 */
+	static async unload(modelId: string): Promise<ApiRouterModelsUnloadResponse> {
+		return apiPost<ApiRouterModelsUnloadResponse>('/models/unload', { model: modelId });
+	}
+
+	/**
+	 *
+	 *
+	 * Status
+	 *
+	 *
+	 */
+
+	/**
+	 * Check if a model is loaded based on its metadata.
+	 *
+	 * @param model - Model data entry from the API response
+	 * @returns True if the model status is LOADED
+	 */
+	static isModelLoaded(model: ApiModelDataEntry): boolean {
+		return model.status.value === ServerModelStatus.LOADED;
+	}
+
+	/**
+	 * Check if a model is currently loading.
+	 *
+	 * @param model - Model data entry from the API response
+	 * @returns True if the model status is LOADING
+	 */
+	static isModelLoading(model: ApiModelDataEntry): boolean {
+		return model.status.value === ServerModelStatus.LOADING;
+	}
+}
diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
new file mode 100644
index 0000000000..46cce5e7cb
--- /dev/null
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
@@ -0,0 +1,148 @@
+import { describe, it, expect } from 'vitest';
+import { ParameterSyncService } from './parameter-sync.service';
+
+describe('ParameterSyncService', () => {
+	describe('roundFloatingPoint', () => {
+		it('should fix JavaScript floating-point precision issues', () => {
+			// Test the specific values from the screenshot
+			const mockServerParams = {
+				top_p: 0.949999988079071,
+				min_p: 0.009999999776482582,
+				temperature: 0.800000011920929,
+				top_k: 40,
+				samplers: ['top_k', 'typ_p', 'top_p', 'min_p', 'temperature']
+			};
+
+			const result = ParameterSyncService.extractServerDefaults({
+				...mockServerParams,
+				// Add other required fields to match the API type
+				n_predict: 512,
+				seed: -1,
+				dynatemp_range: 0.0,
+				dynatemp_exponent: 1.0,
+				xtc_probability: 0.0,
+				xtc_threshold: 0.1,
+				typ_p: 1.0,
+				repeat_last_n: 64,
+				repeat_penalty: 1.0,
+				presence_penalty: 0.0,
+				frequency_penalty: 0.0,
+				dry_multiplier: 0.0,
+				dry_base: 1.75,
+				dry_allowed_length: 2,
+				dry_penalty_last_n: -1,
+				mirostat: 0,
+				mirostat_tau: 5.0,
+				mirostat_eta: 0.1,
+				stop: [],
+				max_tokens: -1,
+				n_keep: 0,
+				n_discard: 0,
+				ignore_eos: false,
+				stream: true,
+				logit_bias: [],
+				n_probs: 0,
+				min_keep: 0,
+				grammar: '',
+				grammar_lazy: false,
+				grammar_triggers: [],
+				preserved_tokens: [],
+				chat_format: '',
+				reasoning_format: '',
+				reasoning_in_content: false,
+				thinking_forced_open: false,
+				'speculative.n_max': 0,
+				'speculative.n_min': 0,
+				'speculative.p_min': 0.0,
+				timings_per_token: false,
+				post_sampling_probs: false,
+				lora: [],
+				top_n_sigma: 0.0,
+				dry_sequence_breakers: []
+			} as ApiLlamaCppServerProps['default_generation_settings']['params']);
+
+			// Check that the problematic floating-point values are rounded correctly
+			expect(result.top_p).toBe(0.95);
+			expect(result.min_p).toBe(0.01);
+			expect(result.temperature).toBe(0.8);
+			expect(result.top_k).toBe(40); // Integer should remain unchanged
+			expect(result.samplers).toBe('top_k;typ_p;top_p;min_p;temperature');
+		});
+
+		it('should preserve non-numeric values', () => {
+			const mockServerParams = {
+				samplers: ['top_k', 'temperature'],
+				max_tokens: -1,
+				temperature: 0.7
+			};
+
+			const result = ParameterSyncService.extractServerDefaults({
+				...mockServerParams,
+				// Minimal required fields
+				n_predict: 512,
+				seed: -1,
+				dynatemp_range: 0.0,
+				dynatemp_exponent: 1.0,
+				top_k: 40,
+				top_p: 0.95,
+				min_p: 0.05,
+				xtc_probability: 0.0,
+				xtc_threshold: 0.1,
+				typ_p: 1.0,
+				repeat_last_n: 64,
+				repeat_penalty: 1.0,
+				presence_penalty: 0.0,
+				frequency_penalty: 0.0,
+				dry_multiplier: 0.0,
+				dry_base: 1.75,
+				dry_allowed_length: 2,
+				dry_penalty_last_n: -1,
+				mirostat: 0,
+				mirostat_tau: 5.0,
+				mirostat_eta: 0.1,
+				stop: [],
+				n_keep: 0,
+				n_discard: 0,
+				ignore_eos: false,
+				stream: true,
+				logit_bias: [],
+				n_probs: 0,
+				min_keep: 0,
+				grammar: '',
+				grammar_lazy: false,
+				grammar_triggers: [],
+				preserved_tokens: [],
+				chat_format: '',
+				reasoning_format: '',
+				reasoning_in_content: false,
+				thinking_forced_open: false,
+				'speculative.n_max': 0,
+				'speculative.n_min': 0,
+				'speculative.p_min': 0.0,
+				timings_per_token: false,
+				post_sampling_probs: false,
+				lora: [],
+				top_n_sigma: 0.0,
+				dry_sequence_breakers: []
+			} as ApiLlamaCppServerProps['default_generation_settings']['params']);
+
+			expect(result.samplers).toBe('top_k;temperature');
+			expect(result.max_tokens).toBe(-1);
+			expect(result.temperature).toBe(0.7);
+		});
+
+		it('should merge webui settings from props when provided', () => {
+			const result = ParameterSyncService.extractServerDefaults(null, {
+				pasteLongTextToFileLen: 0,
+				pdfAsImage: true,
+				renderUserContentAsMarkdown: false,
+				theme: 'dark'
+			});
+
+			expect(result.pasteLongTextToFileLen).toBe(0);
+			expect(result.pdfAsImage).toBe(true);
+			expect(result.renderUserContentAsMarkdown).toBe(false);
+			expect(result.theme).toBeUndefined();
+		});
+	});
+});
diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.ts b/tools/server/webui/src/lib/services/parameter-sync.service.ts
new file mode 100644
index 0000000000..6cb53d12d1
--- /dev/null
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.ts
@@ -0,0 +1,400 @@
+import { normalizeFloatingPoint } from '$lib/utils';
+import { SyncableParameterType, ParameterSource } from '$lib/enums/settings';
+
+type ParameterValue = string | number | boolean;
+type ParameterRecord = Record<string, ParameterValue>;
+
+interface ParameterInfo {
+	value: string | number | boolean;
+	source: ParameterSource;
+	serverDefault?: string | number | boolean;
+	userOverride?: string | number | boolean;
+}
+
+interface SyncableParameter {
+	key: string;
+	serverKey: string;
+	type: SyncableParameterType;
+	canSync: boolean;
+}
+
+/**
+ * Mapping of webui setting keys to server parameter keys.
+ * Only parameters listed here can be synced from the server `/props` endpoint.
+ * Each entry defines the webui key, corresponding server key, value type,
+ * and whether sync is enabled.
+ */
+export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
+	{
+		key: 'temperature',
+		serverKey: 'temperature',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{ key: 'top_k', serverKey: 'top_k', type: SyncableParameterType.NUMBER, canSync: true },
+	{ key: 'top_p', serverKey: 'top_p', type: SyncableParameterType.NUMBER, canSync: true },
+	{ key: 'min_p', serverKey: 'min_p', type: SyncableParameterType.NUMBER, canSync: true },
+	{
+		key: 'dynatemp_range',
+		serverKey: 'dynatemp_range',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'dynatemp_exponent',
+		serverKey: 'dynatemp_exponent',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'xtc_probability',
+		serverKey: 'xtc_probability',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'xtc_threshold',
+		serverKey: 'xtc_threshold',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{ key: 'typ_p', serverKey: 'typ_p', type: SyncableParameterType.NUMBER, canSync: true },
+	{
+		key: 'repeat_last_n',
+		serverKey: 'repeat_last_n',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'repeat_penalty',
+		serverKey: 'repeat_penalty',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'presence_penalty',
+		serverKey: 'presence_penalty',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'frequency_penalty',
+		serverKey: 'frequency_penalty',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'dry_multiplier',
+		serverKey: 'dry_multiplier',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{ key: 'dry_base', serverKey: 'dry_base', type: SyncableParameterType.NUMBER, canSync: true },
+	{
+		key: 'dry_allowed_length',
+		serverKey: 'dry_allowed_length',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'dry_penalty_last_n',
+		serverKey: 'dry_penalty_last_n',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{ key: 'max_tokens', serverKey: 'max_tokens', type: SyncableParameterType.NUMBER, canSync: true },
+	{ key: 'samplers', serverKey: 'samplers', type: SyncableParameterType.STRING, canSync: true },
+	{
+		key: 'pasteLongTextToFileLen',
+		serverKey: 'pasteLongTextToFileLen',
+		type: SyncableParameterType.NUMBER,
+		canSync: true
+	},
+	{
+		key: 'pdfAsImage',
+		serverKey: 'pdfAsImage',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'showThoughtInProgress',
+		serverKey: 'showThoughtInProgress',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'keepStatsVisible',
+		serverKey: 'keepStatsVisible',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'showMessageStats',
+		serverKey: 'showMessageStats',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'askForTitleConfirmation',
+		serverKey: 'askForTitleConfirmation',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'disableAutoScroll',
+		serverKey: 'disableAutoScroll',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'renderUserContentAsMarkdown',
+		serverKey: 'renderUserContentAsMarkdown',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'autoMicOnEmpty',
+		serverKey: 'autoMicOnEmpty',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'pyInterpreterEnabled',
+		serverKey: 'pyInterpreterEnabled',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	},
+	{
+		key: 'enableContinueGeneration',
+		serverKey: 'enableContinueGeneration',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
+	}
+];
+
+export class ParameterSyncService {
+	/**
+	 *
+	 *
+	 * Extraction
+	 *
+	 *
+	 */
+
+	/**
+	 * Round floating-point numbers to avoid JavaScript precision issues.
+	 * E.g., 0.1 + 0.2 = 0.30000000000000004 → 0.3
+	 *
+	 * @param value - Parameter value to normalize
+	 * @returns Precision-normalized value
+	 */
+	private static roundFloatingPoint(value: ParameterValue): ParameterValue {
+		return normalizeFloatingPoint(value) as ParameterValue;
+	}
+
+	/**
+	 * Extract server default parameters that can be synced from `/props` response.
+	 * Handles both generation settings parameters and webui-specific settings.
+	 * Converts samplers array to semicolon-delimited string for UI display.
+	 *
+	 * @param serverParams - Raw generation settings from server `/props` endpoint
+	 * @param webuiSettings - Optional webui-specific settings from server
+	 * @returns Record of extracted parameter key-value pairs with normalized precision
+	 */
+	static extractServerDefaults(
+		serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null,
+		webuiSettings?: Record<string, string | number | boolean>
+	): ParameterRecord {
+		const extracted: ParameterRecord = {};
+
+		if (serverParams) {
+			for (const param of SYNCABLE_PARAMETERS) {
+				if (param.canSync && param.serverKey in serverParams) {
+					const value = (serverParams as unknown as Record<string, ParameterValue>)[
+						param.serverKey
+					];
+					if (value !== undefined) {
+						// Apply precision rounding to avoid JavaScript floating-point issues
+						extracted[param.key] = this.roundFloatingPoint(value);
+					}
+				}
+			}
+
+			// Handle samplers array conversion to string
+			if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
+				extracted.samplers = serverParams.samplers.join(';');
+			}
+		}
+
+		if (webuiSettings) {
+			for (const param of SYNCABLE_PARAMETERS) {
+				if (param.canSync && param.serverKey in webuiSettings) {
+					const value = webuiSettings[param.serverKey];
+					if (value !== undefined) {
+						extracted[param.key] = this.roundFloatingPoint(value);
+					}
+				}
+			}
+		}
+
+		return extracted;
+	}
+
+	/**
+	 *
+	 *
+	 * Merging
+	 *
+	 *
+	 */
+
+	/**
+	 * Merge server defaults with current user settings.
+	 * User overrides always take priority — only parameters not in `userOverrides`
+	 * set will be updated from server defaults.
+	 *
+	 * @param currentSettings - Current parameter values in the settings store
+	 * @param serverDefaults - Default values extracted from server props
+	 * @param userOverrides - Set of parameter keys explicitly overridden by the user
+	 * @returns Merged parameter record with user overrides preserved
+	 */
+	static mergeWithServerDefaults(
+		currentSettings: ParameterRecord,
+		serverDefaults: ParameterRecord,
+		userOverrides: Set<string> = new Set()
+	): ParameterRecord {
+		const merged = { ...currentSettings };
+
+		for (const [key, serverValue] of Object.entries(serverDefaults)) {
+			// Only update if user hasn't explicitly overridden this parameter
+			if (!userOverrides.has(key)) {
+				merged[key] = this.roundFloatingPoint(serverValue);
+			}
+		}
+
+		return merged;
+	}
+
+	/**
+	 *
+	 *
+	 * Info
+	 *
+	 *
+	 */
+
+	/**
+	 * Get parameter information including source and values.
+	 * Used by ChatSettingsParameterSourceIndicator to display the correct badge
+	 * (Custom vs Default) for each parameter in the settings UI.
+	 *
+	 * @param key - The parameter key to get info for
+	 * @param currentValue - The current value of the parameter
+	 * @param propsDefaults - Server default values from `/props`
+	 * @param userOverrides - Set of parameter keys explicitly overridden by the user
+	 * @returns Parameter info with source, server default, and user override values
+	 */
+	static getParameterInfo(
+		key: string,
+		currentValue: ParameterValue,
+		propsDefaults: ParameterRecord,
+		userOverrides: Set<string>
+	): ParameterInfo {
+		const hasPropsDefault = propsDefaults[key] !== undefined;
+		const isUserOverride = userOverrides.has(key);
+
+		// Simple logic: either using default (from props) or custom (user override)
+		const source = isUserOverride ? ParameterSource.CUSTOM : ParameterSource.DEFAULT;
+
+		return {
+			value: currentValue,
+			source,
+			serverDefault: hasPropsDefault ? propsDefaults[key] : undefined, // Keep same field name for compatibility
+			userOverride: isUserOverride ? currentValue : undefined
+		};
+	}
+
+	/**
+	 * Check if a parameter can be synced from server.
+	 *
+	 * @param key - The parameter key to check
+	 * @returns True if the parameter is in the syncable parameters list
+	 */
+	static canSyncParameter(key: string): boolean {
+		return SYNCABLE_PARAMETERS.some((param) => param.key === key && param.canSync);
+	}
+
+	/**
+	 * Get all syncable parameter keys.
+	 *
+	 * @returns Array of parameter keys that can be synced from server
+	 */
+	static getSyncableParameterKeys(): string[] {
+		return SYNCABLE_PARAMETERS.filter((param) => param.canSync).map((param) => param.key);
+	}
+
+	/**
+	 * Validate a server parameter value against its expected type.
+	 *
+	 * @param key - The parameter key to validate
+	 * @param value - The value to validate
+	 * @returns True if value matches the expected type for this parameter
+	 */
+	static validateServerParameter(key: string, value: ParameterValue): boolean {
+		const param = SYNCABLE_PARAMETERS.find((p) => p.key === key);
+		if (!param) return false;
+
+		switch (param.type) {
+			case SyncableParameterType.NUMBER:
+				return typeof value === 'number' && !isNaN(value);
+			case SyncableParameterType.STRING:
+				return typeof value === 'string';
+			case SyncableParameterType.BOOLEAN:
+				return typeof value === 'boolean';
+			default:
+				return false;
+		}
+	}
+
+	/**
+	 *
+	 *
+	 * Diff
+	 *
+	 *
+	 */
+
+	/**
+	 * Create a diff between current settings and server defaults.
+	 * Shows which parameters differ from server values, useful for debugging
+	 * and for the "Reset to defaults" functionality.
+	 *
+	 * @param currentSettings - Current parameter values in the settings store
+	 * @param serverDefaults - Default values extracted from server props
+	 * @returns Record of parameter diffs with current value, server value, and whether they differ
+	 */
+	static createParameterDiff(
+		currentSettings: ParameterRecord,
+		serverDefaults: ParameterRecord
+	): Record<string, { current: ParameterValue; server: ParameterValue; differs: boolean }> {
+		const diff: Record<
+			string,
+			{ current: ParameterValue; server: ParameterValue; differs: boolean }
+		> = {};
+
+		for (const key of this.getSyncableParameterKeys()) {
+			const currentValue = currentSettings[key];
+			const serverValue = serverDefaults[key];
+
+			if (serverValue !== undefined) {
+				diff[key] = {
+					current: currentValue,
+					server: serverValue,
+					differs: currentValue !== serverValue
+				};
+			}
+		}
+
+		return diff;
+	}
+}
diff --git a/tools/server/webui/src/lib/services/parameter-sync.ts b/tools/server/webui/src/lib/services/parameter-sync.ts
index d124cf5c8d..333260701f 100644
--- a/tools/server/webui/src/lib/services/parameter-sync.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.ts
@@ -70,12 +70,6 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
 		canSync: true
 	},
 	{ key: 'showToolCalls', serverKey: 'showToolCalls', type: 'boolean', canSync: true },
-	{
-		key: 'disableReasoningFormat',
-		serverKey: 'disableReasoningFormat',
-		type: 'boolean',
-		canSync: true
-	},
 	{ key: 'keepStatsVisible', serverKey: 'keepStatsVisible', type: 'boolean', canSync: true },
 	{ key: 'showMessageStats', serverKey: 'showMessageStats', type: 'boolean', canSync: true },
 	{
diff --git a/tools/server/webui/src/lib/services/props.service.ts b/tools/server/webui/src/lib/services/props.service.ts
new file mode 100644
index 0000000000..7373b7e016
--- /dev/null
+++ b/tools/server/webui/src/lib/services/props.service.ts
@@ -0,0 +1,47 @@
+import { apiFetchWithParams } from '$lib/utils/api-fetch';
+
+export class PropsService {
+	/**
+	 *
+	 *
+	 * Fetching
+	 *
+	 *
+	 */
+
+	/**
+	 * Fetches global server properties from the `/props` endpoint.
+	 * In MODEL mode, returns modalities for the single loaded model.
+	 * In ROUTER mode, returns server-wide settings without model-specific modalities.
+	 *
+	 * @param autoload - If false, prevents automatic model loading (default: false)
+	 * @returns Server properties including default generation settings and capabilities
+	 * @throws {Error} If the request fails or returns invalid data
+	 */
+	static async fetch(autoload = false): Promise<ApiLlamaCppServerProps> {
+		const params: Record<string, string> = {};
+		if (!autoload) {
+			params.autoload = 'false';
+		}
+
+		return apiFetchWithParams<ApiLlamaCppServerProps>('./props', params, { authOnly: true });
+	}
+
+	/**
+	 * Fetches server properties for a specific model (ROUTER mode only).
+	 * Required in ROUTER mode because global `/props` does not include per-model modalities.
+	 *
+	 * @param modelId - The model ID to fetch properties for
+	 * @param autoload - If false, prevents automatic model loading (default: false)
+	 * @returns Server properties specific to the requested model
+	 * @throws {Error} If the request fails, model not found, or model not loaded
+	 */
+	static async fetchForModel(modelId: string, autoload = false): Promise<ApiLlamaCppServerProps> {
+		const params: Record<string, string> = { model: modelId };
+		if (!autoload) {
+			params.autoload = 'false';
+		}
+
+		return apiFetchWithParams<ApiLlamaCppServerProps>('./props', params, { authOnly: true });
+	}
+}
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index 879b2f3245..f00f418b4c 100644
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -15,6 +15,7 @@ import {
 } from '$lib/utils';
 import { SvelteMap } from 'svelte/reactivity';
 import { DEFAULT_CONTEXT } from '$lib/constants/default-context';
+import { SYSTEM_MESSAGE_PLACEHOLDER } from '$lib/constants/ui';
 
 /**
  * chatStore - Active AI interaction and streaming state management
@@ -76,6 +77,10 @@ class ChatStore {
 	private isStreamingActive = $state(false);
 	private isEditModeActive = $state(false);
 	private addFilesHandler: ((files: File[]) => void) | null = $state(null);
+	pendingEditMessageId = $state<string | null>(null);
+	// Draft preservation for navigation (e.g., when adding system prompt from welcome page)
+	private _pendingDraftMessage = $state<string>('');
+	private _pendingDraftFiles = $state<ChatUploadedFile[]>([]);
 
 	// ─────────────────────────────────────────────────────────────────────────────
 	// Loading State
@@ -113,6 +118,16 @@ class ChatStore {
 		this.isLoading = this.isChatLoading(convId);
 		const streamingState = this.getChatStreaming(convId);
 		this.currentResponse = streamingState?.response || '';
+		this.isStreamingActive = streamingState !== undefined;
+		this.setActiveProcessingConversation(convId);
+
+		// Sync streaming content to activeMessages so UI displays current content
+		if (streamingState?.response && streamingState?.messageId) {
+			const idx = conversationsStore.findMessageIndex(streamingState.messageId);
+			if (idx !== -1) {
+				conversationsStore.updateMessageAtIndex(idx, { content: streamingState.response });
+			}
+		}
 	}
 
 	/**
@@ -455,6 +470,166 @@ class ChatStore {
 		}
 	}
 
+	/**
+	 * Adds a system message at the top of a conversation and triggers edit mode.
+	 * The system message is inserted between root and the first message of the active branch.
+	 * Creates a new conversation if one doesn't exist.
+	 */
+	async addSystemPrompt(): Promise<void> {
+		let activeConv = conversationsStore.activeConversation;
+
+		// Create conversation if needed
+		if (!activeConv) {
+			await conversationsStore.createConversation();
+			activeConv = conversationsStore.activeConversation;
+		}
+		if (!activeConv) return;
+
+		try {
+			// Get all messages to find the root
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
+			let rootId: string;
+
+			// Create root message if it doesn't exist
+			if (!rootMessage) {
+				rootId = await DatabaseService.createRootMessage(activeConv.id);
+			} else {
+				rootId = rootMessage.id;
+			}
+
+			// Check if there's already a system message as root's child
+			const existingSystemMessage = allMessages.find(
+				(m) => m.role === 'system' && m.parent === rootId
+			);
+
+			if (existingSystemMessage) {
+				// If system message exists, just trigger edit mode on it
+				this.pendingEditMessageId = existingSystemMessage.id;
+
+				// Make sure it's in active messages at the beginning
+				if (!conversationsStore.activeMessages.some((m) => m.id === existingSystemMessage.id)) {
+					conversationsStore.activeMessages.unshift(existingSystemMessage);
+				}
+				return;
+			}
+
+			// Find the first message of the active branch (child of root that's in activeMessages)
+			const activeMessages = conversationsStore.activeMessages;
+			const firstActiveMessage = activeMessages.find((m) => m.parent === rootId);
+
+			// Create new system message with placeholder content (will be edited by user)
+			const systemMessage = await DatabaseService.createSystemMessage(
+				activeConv.id,
+				SYSTEM_MESSAGE_PLACEHOLDER,
+				rootId
+			);
+
+			// If there's a first message in the active branch, re-parent it to the system message
+			if (firstActiveMessage) {
+				// Update the first message's parent to be the system message
+				await DatabaseService.updateMessage(firstActiveMessage.id, {
+					parent: systemMessage.id
+				});
+
+				// Update the system message's children to include the first message
+				await DatabaseService.updateMessage(systemMessage.id, {
+					children: [firstActiveMessage.id]
+				});
+
+				// Remove first message from root's children
+				const updatedRootChildren = rootMessage
+					? rootMessage.children.filter((id: string) => id !== firstActiveMessage.id)
+					: [];
+				// Note: system message was already added to root's children by createSystemMessage
+				await DatabaseService.updateMessage(rootId, {
+					children: [
+						...updatedRootChildren.filter((id: string) => id !== systemMessage.id),
+						systemMessage.id
+					]
+				});
+
+				// Update local state
+				const firstMsgIndex = conversationsStore.findMessageIndex(firstActiveMessage.id);
+				if (firstMsgIndex !== -1) {
+					conversationsStore.updateMessageAtIndex(firstMsgIndex, { parent: systemMessage.id });
+				}
+			}
+
+			// Add system message to active messages at the beginning
+			conversationsStore.activeMessages.unshift(systemMessage);
+
+			// Set pending edit message ID to trigger edit mode
+			this.pendingEditMessageId = systemMessage.id;
+
+			conversationsStore.updateConversationTimestamp();
+		} catch (error) {
+			console.error('Failed to add system prompt:', error);
+		}
+	}
+
+	/**
+	 * Removes a system message placeholder without deleting its children.
+	 * Re-parents children back to the root message.
+	 * If this is a new empty conversation (only root + system placeholder), deletes the entire conversation.
+	 * @returns true if the entire conversation was deleted, false otherwise
+	 */
+	async removeSystemPromptPlaceholder(messageId: string): Promise<boolean> {
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv) return false;
+
+		try {
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const systemMessage = allMessages.find((m) => m.id === messageId);
+			if (!systemMessage || systemMessage.role !== 'system') return false;
+
+			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
+			if (!rootMessage) return false;
+
+			// Check if this is a new empty conversation (only root + system placeholder)
+			const isEmptyConversation = allMessages.length === 2 && systemMessage.children.length === 0;
+
+			if (isEmptyConversation) {
+				// Delete the entire conversation
+				await conversationsStore.deleteConversation(activeConv.id);
+				return true;
+			}
+
+			// Re-parent system message's children to root
+			for (const childId of systemMessage.children) {
+				await DatabaseService.updateMessage(childId, { parent: rootMessage.id });
+
+				// Update local state
+				const childIndex = conversationsStore.findMessageIndex(childId);
+				if (childIndex !== -1) {
+					conversationsStore.updateMessageAtIndex(childIndex, { parent: rootMessage.id });
+				}
+			}
+
+			// Update root's children: remove system message, add system's children
+			const newRootChildren = [
+				...rootMessage.children.filter((id: string) => id !== messageId),
+				...systemMessage.children
+			];
+			await DatabaseService.updateMessage(rootMessage.id, { children: newRootChildren });
+
+			// Delete the system message (without cascade)
+			await DatabaseService.deleteMessage(messageId);
+
+			// Remove from active messages
+			const systemIndex = conversationsStore.findMessageIndex(messageId);
+			if (systemIndex !== -1) {
+				conversationsStore.activeMessages.splice(systemIndex, 1);
+			}
+
+			conversationsStore.updateConversationTimestamp();
+			return false;
+		} catch (error) {
+			console.error('Failed to remove system prompt placeholder:', error);
+			return false;
+		}
+	}
+
 	private async createAssistantMessage(parentId?: string): Promise<DatabaseMessage | null> {
 		const activeConv = conversationsStore.activeConversation;
 		if (!activeConv) return null;
@@ -916,6 +1091,28 @@ class ChatStore {
 		if (!activeConv)
 			return { totalCount: 0, userMessages: 0, assistantMessages: 0, messageTypes: [] };
 		const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+		const messageToDelete = allMessages.find((m) => m.id === messageId);
+
+		// For system messages, don't count descendants as they will be preserved (reparented to root)
+		if (messageToDelete?.role === 'system') {
+			const messagesToDelete = allMessages.filter((m) => m.id === messageId);
+			let userMessages = 0,
+				assistantMessages = 0;
+			const messageTypes: string[] = [];
+
+			for (const msg of messagesToDelete) {
+				if (msg.role === 'user') {
+					userMessages++;
+					if (!messageTypes.includes('user message')) messageTypes.push('user message');
+				} else if (msg.role === 'assistant') {
+					assistantMessages++;
+					if (!messageTypes.includes('assistant response')) messageTypes.push('assistant response');
+				}
+			}
+
+			return { totalCount: 1, userMessages, assistantMessages, messageTypes };
+		}
+
 		const descendants = findDescendantMessages(allMessages, messageId);
 		const allToDelete = [messageId, ...descendants];
 		const messagesToDelete = allMessages.filter((m) => allToDelete.includes(m.id));
@@ -1381,6 +1578,31 @@ class ChatStore {
 		return this.addFilesHandler;
 	}
 
+	savePendingDraft(message: string, files: ChatUploadedFile[]): void {
+		this._pendingDraftMessage = message;
+		this._pendingDraftFiles = [...files];
+	}
+
+	consumePendingDraft(): { message: string; files: ChatUploadedFile[] } | null {
+		if (!this._pendingDraftMessage && this._pendingDraftFiles.length === 0) {
+			return null;
+		}
+
+		const draft = {
+			message: this._pendingDraftMessage,
+			files: [...this._pendingDraftFiles]
+		};
+
+		this._pendingDraftMessage = '';
+		this._pendingDraftFiles = [];
+
+		return draft;
+	}
+
+	hasPendingDraft(): boolean {
+		return Boolean(this._pendingDraftMessage) || this._pendingDraftFiles.length > 0;
+	}
+
 	public getAllLoadingChats(): string[] {
 		return Array.from(this.chatLoadingStates.keys());
 	}
@@ -1427,7 +1649,7 @@ class ChatStore {
 
 		// Config options needed by ChatService
 		if (currentConfig.systemMessage) apiOptions.systemMessage = currentConfig.systemMessage;
-		if (currentConfig.disableReasoningFormat) apiOptions.disableReasoningFormat = true;
+		if (currentConfig.disableReasoningParsing) apiOptions.disableReasoningParsing = true;
 
 		if (hasValue(currentConfig.temperature))
 			apiOptions.temperature = Number(currentConfig.temperature);
@@ -1485,3 +1707,7 @@ export const isEditing = () => chatStore.isEditing();
 export const isLoading = () => chatStore.isLoading;
 export const setEditModeActive = (handler: (files: File[]) => void) =>
 	chatStore.setEditModeActive(handler);
+export const pendingEditMessageId = () => chatStore.pendingEditMessageId;
+export const clearPendingEditMessageId = () => (chatStore.pendingEditMessageId = null);
+export const removeSystemPromptPlaceholder = (messageId: string) =>
+	chatStore.removeSystemPromptPlaceholder(messageId);
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index 714509f024..307e3b71d9 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -1,8 +1,19 @@
-import type { ServerModelStatus, ServerRole } from '$lib/enums';
-import type { ChatMessagePromptProgress } from './chat';
+import type { ContentPartType, ServerModelStatus, ServerRole } from '$lib/enums';
+import type { ChatMessagePromptProgress, ChatRole } from './chat';
+
+export interface ApiChatCompletionToolFunction {
+	name: string;
+	description?: string;
+	parameters: Record<string, unknown>;
+}
+
+export interface ApiChatCompletionTool {
+	type: 'function';
+	function: ApiChatCompletionToolFunction;
+}
 
 export interface ApiChatMessageContentPart {
-	type: 'text' | 'image_url' | 'input_audio';
+	type: ContentPartType;
 	text?: string;
 	image_url?: {
 		url: string;
@@ -34,6 +45,8 @@ export interface ApiErrorResponse {
 export interface ApiChatMessageData {
 	role: ChatRole;
 	content: string | ApiChatMessageContentPart[];
+	tool_calls?: ApiChatCompletionToolCall[];
+	tool_call_id?: string;
 	timestamp?: number;
 }
 
@@ -188,6 +201,7 @@ export interface ApiChatCompletionRequest {
 	stream?: boolean;
 	model?: string;
 	return_progress?: boolean;
+	tools?: ApiChatCompletionTool[];
 	// Reasoning parameters
 	reasoning_format?: string;
 	// Generation parameters
@@ -247,6 +261,7 @@ export interface ApiChatCompletionStreamChunk {
 			model?: string;
 			tool_calls?: ApiChatCompletionToolCallDelta[];
 		};
+		finish_reason?: string | null;
 	}>;
 	timings?: {
 		prompt_n?: number;
@@ -267,8 +282,9 @@ export interface ApiChatCompletionResponse {
 			content: string;
 			reasoning_content?: string;
 			model?: string;
-			tool_calls?: ApiChatCompletionToolCallDelta[];
+			tool_calls?: ApiChatCompletionToolCall[];
 		};
+		finish_reason?: string | null;
 	}>;
 }
 
@@ -335,7 +351,7 @@ export interface ApiProcessingState {
 	tokensDecoded: number;
 	tokensRemaining: number;
 	contextUsed: number;
-	contextTotal: number;
+	contextTotal: number | null;
 	outputTokensUsed: number; // Total output tokens (thinking + regular content)
 	outputTokensMax: number; // Max output tokens allowed
 	temperature: number;
diff --git a/tools/server/webui/src/lib/types/models.d.ts b/tools/server/webui/src/lib/types/models.d.ts
index ef44a2cb6d..505867a1f0 100644
--- a/tools/server/webui/src/lib/types/models.d.ts
+++ b/tools/server/webui/src/lib/types/models.d.ts
@@ -1,8 +1,5 @@
 import type { ApiModelDataEntry, ApiModelDetails } from '$lib/types/api';
 
-/**
- * Model modalities - vision and audio capabilities
- */
 export interface ModelModalities {
 	vision: boolean;
 	audio: boolean;
@@ -14,8 +11,15 @@ export interface ModelOption {
 	model: string;
 	description?: string;
 	capabilities: string[];
-	/** Model modalities from /props endpoint */
 	modalities?: ModelModalities;
 	details?: ApiModelDetails['details'];
 	meta?: ApiModelDataEntry['meta'];
 }
+
+/**
+ * Modality capabilities for file validation
+ */
+export interface ModalityCapabilities {
+	hasVision: boolean;
+	hasAudio: boolean;
+}
diff --git a/tools/server/webui/src/lib/types/settings.d.ts b/tools/server/webui/src/lib/types/settings.d.ts
index 38b3047dd0..d894245ec3 100644
--- a/tools/server/webui/src/lib/types/settings.d.ts
+++ b/tools/server/webui/src/lib/types/settings.d.ts
@@ -18,8 +18,8 @@ export interface SettingsChatServiceOptions {
 	model?: string;
 	// System message to inject
 	systemMessage?: string;
-	// Disable reasoning format (use 'none' instead of 'auto')
-	disableReasoningFormat?: boolean;
+	// Disable reasoning parsing (use 'none' instead of 'auto')
+	disableReasoningParsing?: boolean;
 	// Generation parameters
 	temperature?: number;
 	max_tokens?: number;
diff --git a/tools/server/webui/src/lib/utils/abort.ts b/tools/server/webui/src/lib/utils/abort.ts
new file mode 100644
index 0000000000..fc4f31ec69
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/abort.ts
@@ -0,0 +1,151 @@
+/**
+ * Abort Signal Utilities
+ *
+ * Provides utilities for consistent AbortSignal propagation across the application.
+ * These utilities help ensure that async operations can be properly cancelled
+ * when needed (e.g., user stops generation, navigates away, etc.).
+ */
+
+/**
+ * Throws an AbortError if the signal is aborted.
+ * Use this at the start of async operations to fail fast.
+ *
+ * @param signal - Optional AbortSignal to check
+ * @throws DOMException with name 'AbortError' if signal is aborted
+ *
+ * @example
+ * ```ts
+ * async function fetchData(signal?: AbortSignal) {
+ *   throwIfAborted(signal);
+ *   // ... proceed with operation
+ * }
+ * ```
+ */
+export function throwIfAborted(signal?: AbortSignal): void {
+	if (signal?.aborted) {
+		throw new DOMException('Operation was aborted', 'AbortError');
+	}
+}
+
+/**
+ * Checks if an error is an AbortError.
+ * Use this to distinguish between user-initiated cancellation and actual errors.
+ *
+ * @param error - Error to check
+ * @returns true if the error is an AbortError
+ *
+ * @example
+ * ```ts
+ * try {
+ *   await fetchData(signal);
+ * } catch (error) {
+ *   if (isAbortError(error)) {
+ *     // User cancelled - no error dialog needed
+ *     return;
+ *   }
+ *   // Handle actual error
+ * }
+ * ```
+ */
+export function isAbortError(error: unknown): boolean {
+	if (error instanceof DOMException && error.name === 'AbortError') {
+		return true;
+	}
+	if (error instanceof Error && error.name === 'AbortError') {
+		return true;
+	}
+	return false;
+}
+
+/**
+ * Creates a new AbortController that is linked to one or more parent signals.
+ * When any parent signal aborts, the returned controller also aborts.
+ *
+ * Useful for creating child operations that should be cancelled when
+ * either the parent operation or their own timeout/condition triggers.
+ *
+ * @param signals - Parent signals to link to (undefined signals are ignored)
+ * @returns A new AbortController linked to all provided signals
+ *
+ * @example
+ * ```ts
+ * // Link to user's abort signal and add a timeout
+ * const linked = createLinkedController(userSignal, timeoutSignal);
+ * await fetch(url, { signal: linked.signal });
+ * ```
+ */
+export function createLinkedController(...signals: (AbortSignal | undefined)[]): AbortController {
+	const controller = new AbortController();
+
+	for (const signal of signals) {
+		if (!signal) continue;
+
+		// If already aborted, abort immediately
+		if (signal.aborted) {
+			controller.abort(signal.reason);
+			return controller;
+		}
+
+		// Link to parent signal
+		signal.addEventListener('abort', () => controller.abort(signal.reason), { once: true });
+	}
+
+	return controller;
+}
+
+/**
+ * Creates an AbortSignal that times out after the specified duration.
+ *
+ * @param ms - Timeout duration in milliseconds
+ * @returns AbortSignal that will abort after the timeout
+ *
+ * @example
+ * ```ts
+ * const signal = createTimeoutSignal(5000); // 5 second timeout
+ * await fetch(url, { signal });
+ * ```
+ */
+export function createTimeoutSignal(ms: number): AbortSignal {
+	return AbortSignal.timeout(ms);
+}
+
+/**
+ * Wraps a promise to reject if the signal is aborted.
+ * Useful for making non-abortable promises respect an AbortSignal.
+ *
+ * @param promise - Promise to wrap
+ * @param signal - AbortSignal to respect
+ * @returns Promise that rejects with AbortError if signal aborts
+ *
+ * @example
+ * ```ts
+ * // Make a non-abortable operation respect abort signal
+ * const result = await withAbortSignal(
+ *   someNonAbortableOperation(),
+ *   signal
+ * );
+ * ```
+ */
+export async function withAbortSignal<T>(promise: Promise<T>, signal?: AbortSignal): Promise<T> {
+	if (!signal) return promise;
+
+	throwIfAborted(signal);
+
+	return new Promise<T>((resolve, reject) => {
+		const abortHandler = () => {
+			reject(new DOMException('Operation was aborted', 'AbortError'));
+		};
+
+		signal.addEventListener('abort', abortHandler, { once: true });
+
+		promise
+			.then((value) => {
+				signal.removeEventListener('abort', abortHandler);
+				resolve(value);
+			})
+			.catch((error) => {
+				signal.removeEventListener('abort', abortHandler);
+				reject(error);
+			});
+	});
+}
diff --git a/tools/server/webui/src/lib/utils/api-fetch.ts b/tools/server/webui/src/lib/utils/api-fetch.ts
new file mode 100644
index 0000000000..28757a966f
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/api-fetch.ts
@@ -0,0 +1,154 @@
+import { base } from '$app/paths';
+import { getJsonHeaders, getAuthHeaders } from './api-headers';
+
+/**
+ * API Fetch Utilities
+ *
+ * Provides common fetch patterns used across services:
+ * - Automatic JSON headers
+ * - Error handling with proper error messages
+ * - Base path resolution
+ */
+
+export interface ApiFetchOptions extends Omit<RequestInit, 'headers'> {
+	/**
+	 * Use auth-only headers (no Content-Type).
+	 * Default: false (uses JSON headers with Content-Type: application/json)
+	 */
+	authOnly?: boolean;
+	/**
+	 * Additional headers to merge with default headers.
+	 */
+	headers?: Record<string, string>;
+}
+
+/**
+ * Fetch JSON data from an API endpoint with standard headers and error handling.
+ *
+ * @param path - API path (will be prefixed with base path)
+ * @param options - Fetch options with additional authOnly flag
+ * @returns Parsed JSON response
+ * @throws Error with formatted message on failure
+ *
+ * @example
+ * ```typescript
+ * // GET request
+ * const models = await apiFetch<ApiModelListResponse>('/v1/models');
+ *
+ * // POST request
+ * const result = await apiFetch<ApiResponse>('/models/load', {
+ *   method: 'POST',
+ *   body: JSON.stringify({ model: 'gpt-4' })
+ * });
+ * ```
+ */
+export async function apiFetch<T>(path: string, options: ApiFetchOptions = {}): Promise<T> {
+	const { authOnly = false, headers: customHeaders, ...fetchOptions } = options;
+
+	const baseHeaders = authOnly ? getAuthHeaders() : getJsonHeaders();
+	const headers = { ...baseHeaders, ...customHeaders };
+
+	const url = path.startsWith('http://') || path.startsWith('https://') ? path : `${base}${path}`;
+
+	const response = await fetch(url, {
+		...fetchOptions,
+		headers
+	});
+
+	if (!response.ok) {
+		const errorMessage = await parseErrorMessage(response);
+		throw new Error(errorMessage);
+	}
+
+	return response.json() as Promise<T>;
+}
+
+/**
+ * Fetch with URL constructed from base URL and query parameters.
+ *
+ * @param basePath - Base API path
+ * @param params - Query parameters to append
+ * @param options - Fetch options
+ * @returns Parsed JSON response
+ *
+ * @example
+ * ```typescript
+ * const props = await apiFetchWithParams<ApiProps>('./props', {
+ *   model: 'gpt-4',
+ *   autoload: 'false'
+ * });
+ * ```
+ */
+export async function apiFetchWithParams<T>(
+	basePath: string,
+	params: Record<string, string>,
+	options: ApiFetchOptions = {}
+): Promise<T> {
+	const url = new URL(basePath, window.location.href);
+
+	for (const [key, value] of Object.entries(params)) {
+		if (value !== undefined && value !== null) {
+			url.searchParams.set(key, value);
+		}
+	}
+
+	const { authOnly = false, headers: customHeaders, ...fetchOptions } = options;
+
+	const baseHeaders = authOnly ? getAuthHeaders() : getJsonHeaders();
+	const headers = { ...baseHeaders, ...customHeaders };
+
+	const response = await fetch(url.toString(), {
+		...fetchOptions,
+		headers
+	});
+
+	if (!response.ok) {
+		const errorMessage = await parseErrorMessage(response);
+		throw new Error(errorMessage);
+	}
+
+	return response.json() as Promise<T>;
+}
+
+/**
+ * POST JSON data to an API endpoint.
+ *
+ * @param path - API path
+ * @param body - Request body (will be JSON stringified)
+ * @param options - Additional fetch options
+ * @returns Parsed JSON response
+ */
+export async function apiPost<T, B = unknown>(
+	path: string,
+	body: B,
+	options: ApiFetchOptions = {}
+): Promise<T> {
+	return apiFetch<T>(path, {
+		method: 'POST',
+		body: JSON.stringify(body),
+		...options
+	});
+}
+
+/**
+ * Parse error message from a failed response.
+ * Tries to extract error message from JSON body, falls back to status text.
+ */
+async function parseErrorMessage(response: Response): Promise<string> {
+	try {
+		const errorData = await response.json();
+		if (errorData?.error?.message) {
+			return errorData.error.message;
+		}
+		if (errorData?.error && typeof errorData.error === 'string') {
+			return errorData.error;
+		}
+		if (errorData?.message) {
+			return errorData.message;
+		}
+	} catch {
+		// JSON parsing failed, use status text
+	}
+
+	return `Request failed: ${response.status} ${response.statusText}`;
+}
diff --git a/tools/server/webui/src/lib/utils/branching.ts b/tools/server/webui/src/lib/utils/branching.ts
index 3be56047a5..ee3a505eed 100644
--- a/tools/server/webui/src/lib/utils/branching.ts
+++ b/tools/server/webui/src/lib/utils/branching.ts
@@ -15,6 +15,8 @@
  *        └── message 5 (assistant)
  */
 
+import { MessageRole } from '$lib/enums/chat';
+
 /**
  * Filters messages to get the conversation path from root to a specific leaf node.
  * If the leafNodeId doesn't exist, returns the path with the latest timestamp.
@@ -65,8 +67,13 @@ export function filterByLeafNodeId(
 		currentNode = nodeMap.get(currentNode.parent);
 	}
 
-	// Sort by timestamp to get chronological order (root to leaf)
-	result.sort((a, b) => a.timestamp - b.timestamp);
+	// Sort: system messages first, then by timestamp
+	result.sort((a, b) => {
+		if (a.role === MessageRole.SYSTEM && b.role !== MessageRole.SYSTEM) return -1;
+		if (a.role !== MessageRole.SYSTEM && b.role === MessageRole.SYSTEM) return 1;
+
+		return a.timestamp - b.timestamp;
+	});
 	return result;
 }
 
diff --git a/tools/server/webui/src/lib/utils/browser-only.ts b/tools/server/webui/src/lib/utils/browser-only.ts
index 0af800638b..27d2be4aaa 100644
--- a/tools/server/webui/src/lib/utils/browser-only.ts
+++ b/tools/server/webui/src/lib/utils/browser-only.ts
@@ -23,7 +23,7 @@ export {
 } from './pdf-processing';
 
 // File conversion utilities (depends on pdf-processing)
-export { parseFilesToMessageExtras, type FileProcessingResult } from './convert-files-to-extra';
+export { parseFilesToMessageExtras } from './convert-files-to-extra';
 
 // File upload processing utilities (depends on pdf-processing, svg-to-png, webp-to-png)
 export { processFilesToChatUploaded } from './process-uploaded-files';
diff --git a/tools/server/webui/src/lib/utils/cache-ttl.ts b/tools/server/webui/src/lib/utils/cache-ttl.ts
new file mode 100644
index 0000000000..9d1f005822
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/cache-ttl.ts
@@ -0,0 +1,293 @@
+const DEFAULT_CACHE_TTL_MS = 5 * 60 * 1000;
+const DEFAULT_CACHE_MAX_ENTRIES = 100;
+
+/**
+ * TTL Cache - Time-To-Live cache implementation for memory optimization
+ *
+ * Provides automatic expiration of cached entries to prevent memory bloat
+ * in long-running sessions.
+ *
+ * @example
+ * ```ts
+ * const cache = new TTLCache<string, ApiData>({ ttlMs: 5 * 60 * 1000 }); // 5 minutes
+ * cache.set('key', data);
+ * const value = cache.get('key'); // null if expired
+ * ```
+ */
+
+export interface TTLCacheOptions {
+	/** Time-to-live in milliseconds. Default: 5 minutes */
+	ttlMs?: number;
+	/** Maximum number of entries. Oldest entries are evicted when exceeded. Default: 100 */
+	maxEntries?: number;
+	/** Callback when an entry expires or is evicted */
+	onEvict?: (key: string, value: unknown) => void;
+}
+
+interface CacheEntry<T> {
+	value: T;
+	expiresAt: number;
+	lastAccessed: number;
+}
+
+export class TTLCache<K extends string, V> {
+	private cache = new Map<K, CacheEntry<V>>();
+	private readonly ttlMs: number;
+	private readonly maxEntries: number;
+	private readonly onEvict?: (key: string, value: unknown) => void;
+
+	constructor(options: TTLCacheOptions = {}) {
+		this.ttlMs = options.ttlMs ?? DEFAULT_CACHE_TTL_MS;
+		this.maxEntries = options.maxEntries ?? DEFAULT_CACHE_MAX_ENTRIES;
+		this.onEvict = options.onEvict;
+	}
+
+	/**
+	 * Get a value from cache. Returns null if expired or not found.
+	 */
+	get(key: K): V | null {
+		const entry = this.cache.get(key);
+		if (!entry) return null;
+
+		if (Date.now() > entry.expiresAt) {
+			this.delete(key);
+			return null;
+		}
+
+		// Update last accessed time for LRU-like behavior
+		entry.lastAccessed = Date.now();
+		return entry.value;
+	}
+
+	/**
+	 * Set a value in cache with TTL.
+	 */
+	set(key: K, value: V, customTtlMs?: number): void {
+		// Evict oldest entries if at capacity
+		if (this.cache.size >= this.maxEntries && !this.cache.has(key)) {
+			this.evictOldest();
+		}
+
+		const ttl = customTtlMs ?? this.ttlMs;
+		const now = Date.now();
+
+		this.cache.set(key, {
+			value,
+			expiresAt: now + ttl,
+			lastAccessed: now
+		});
+	}
+
+	/**
+	 * Check if key exists and is not expired.
+	 */
+	has(key: K): boolean {
+		const entry = this.cache.get(key);
+		if (!entry) return false;
+
+		if (Date.now() > entry.expiresAt) {
+			this.delete(key);
+			return false;
+		}
+
+		return true;
+	}
+
+	/**
+	 * Delete a specific key from cache.
+	 */
+	delete(key: K): boolean {
+		const entry = this.cache.get(key);
+		if (entry && this.onEvict) {
+			this.onEvict(key, entry.value);
+		}
+		return this.cache.delete(key);
+	}
+
+	/**
+	 * Clear all entries from cache.
+	 */
+	clear(): void {
+		if (this.onEvict) {
+			for (const [key, entry] of this.cache) {
+				this.onEvict(key, entry.value);
+			}
+		}
+		this.cache.clear();
+	}
+
+	/**
+	 * Get the number of entries (including potentially expired ones).
+	 */
+	get size(): number {
+		return this.cache.size;
+	}
+
+	/**
+	 * Remove all expired entries from cache.
+	 * Call periodically for proactive cleanup.
+	 */
+	prune(): number {
+		const now = Date.now();
+		let pruned = 0;
+
+		for (const [key, entry] of this.cache) {
+			if (now > entry.expiresAt) {
+				this.delete(key);
+				pruned++;
+			}
+		}
+
+		return pruned;
+	}
+
+	/**
+	 * Get all valid (non-expired) keys.
+	 */
+	keys(): K[] {
+		const now = Date.now();
+		const validKeys: K[] = [];
+
+		for (const [key, entry] of this.cache) {
+			if (now <= entry.expiresAt) {
+				validKeys.push(key);
+			}
+		}
+
+		return validKeys;
+	}
+
+	/**
+	 * Evict the oldest (least recently accessed) entry.
+	 */
+	private evictOldest(): void {
+		let oldestKey: K | null = null;
+		let oldestTime = Infinity;
+
+		for (const [key, entry] of this.cache) {
+			if (entry.lastAccessed < oldestTime) {
+				oldestTime = entry.lastAccessed;
+				oldestKey = key;
+			}
+		}
+
+		if (oldestKey !== null) {
+			this.delete(oldestKey);
+		}
+	}
+
+	/**
+	 * Refresh TTL for an existing entry without changing the value.
+	 */
+	touch(key: K): boolean {
+		const entry = this.cache.get(key);
+		if (!entry) return false;
+
+		const now = Date.now();
+		if (now > entry.expiresAt) {
+			this.delete(key);
+			return false;
+		}
+
+		entry.expiresAt = now + this.ttlMs;
+		entry.lastAccessed = now;
+		return true;
+	}
+}
+
+/**
+ * Reactive TTL Map for Svelte stores
+ * Wraps SvelteMap with TTL functionality
+ */
+export class ReactiveTTLMap<K extends string, V> {
+	private entries = $state<Map<K, CacheEntry<V>>>(new Map());
+	private readonly ttlMs: number;
+	private readonly maxEntries: number;
+
+	constructor(options: TTLCacheOptions = {}) {
+		this.ttlMs = options.ttlMs ?? DEFAULT_CACHE_TTL_MS;
+		this.maxEntries = options.maxEntries ?? DEFAULT_CACHE_MAX_ENTRIES;
+	}
+
+	get(key: K): V | null {
+		const entry = this.entries.get(key);
+		if (!entry) return null;
+
+		if (Date.now() > entry.expiresAt) {
+			this.entries.delete(key);
+			return null;
+		}
+
+		entry.lastAccessed = Date.now();
+		return entry.value;
+	}
+
+	set(key: K, value: V, customTtlMs?: number): void {
+		if (this.entries.size >= this.maxEntries && !this.entries.has(key)) {
+			this.evictOldest();
+		}
+
+		const ttl = customTtlMs ?? this.ttlMs;
+		const now = Date.now();
+
+		this.entries.set(key, {
+			value,
+			expiresAt: now + ttl,
+			lastAccessed: now
+		});
+	}
+
+	has(key: K): boolean {
+		const entry = this.entries.get(key);
+		if (!entry) return false;
+
+		if (Date.now() > entry.expiresAt) {
+			this.entries.delete(key);
+			return false;
+		}
+
+		return true;
+	}
+
+	delete(key: K): boolean {
+		return this.entries.delete(key);
+	}
+
+	clear(): void {
+		this.entries.clear();
+	}
+
+	get size(): number {
+		return this.entries.size;
+	}
+
+	prune(): number {
+		const now = Date.now();
+		let pruned = 0;
+
+		for (const [key, entry] of this.entries) {
+			if (now > entry.expiresAt) {
+				this.entries.delete(key);
+				pruned++;
+			}
+		}
+
+		return pruned;
+	}
+
+	private evictOldest(): void {
+		let oldestKey: K | null = null;
+		let oldestTime = Infinity;
+
+		for (const [key, entry] of this.entries) {
+			if (entry.lastAccessed < oldestTime) {
+				oldestTime = entry.lastAccessed;
+				oldestKey = key;
+			}
+		}
+
+		if (oldestKey !== null) {
+			this.entries.delete(oldestKey);
+		}
+	}
+}
diff --git a/tools/server/webui/src/lib/utils/code.ts b/tools/server/webui/src/lib/utils/code.ts
new file mode 100644
index 0000000000..67efc6b27e
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/code.ts
@@ -0,0 +1,85 @@
+import hljs from 'highlight.js';
+import {
+	NEWLINE,
+	DEFAULT_LANGUAGE,
+	LANG_PATTERN,
+	AMPERSAND_REGEX,
+	LT_REGEX,
+	GT_REGEX,
+	FENCE_PATTERN
+} from '$lib/constants/code';
+
+export interface IncompleteCodeBlock {
+	language: string;
+	code: string;
+	openingIndex: number;
+}
+
+/**
+ * Highlights code using highlight.js
+ * @param code - The code to highlight
+ * @param language - The programming language
+ * @returns HTML string with syntax highlighting
+ */
+export function highlightCode(code: string, language: string): string {
+	if (!code) return '';
+
+	try {
+		const lang = language.toLowerCase();
+		const isSupported = hljs.getLanguage(lang);
+
+		if (isSupported) {
+			return hljs.highlight(code, { language: lang }).value;
+		} else {
+			return hljs.highlightAuto(code).value;
+		}
+	} catch {
+		// Fallback to escaped plain text
+		return code
+			.replace(AMPERSAND_REGEX, '&amp;')
+			.replace(LT_REGEX, '&lt;')
+			.replace(GT_REGEX, '&gt;');
+	}
+}
+
+/**
+ * Detects if markdown ends with an incomplete code block (opened but not closed).
+ * Returns the code block info if found, null otherwise.
+ * @param markdown - The raw markdown string to check
+ * @returns IncompleteCodeBlock info or null
+ */
+export function detectIncompleteCodeBlock(markdown: string): IncompleteCodeBlock | null {
+	// Count all code fences in the markdown
+	// A code block is incomplete if there's an odd number of ``` fences
+	const fencePattern = new RegExp(FENCE_PATTERN.source, FENCE_PATTERN.flags);
+	const fences: number[] = [];
+	let fenceMatch;
+
+	while ((fenceMatch = fencePattern.exec(markdown)) !== null) {
+		// Store the position after the ```
+		const pos = fenceMatch[0].startsWith(NEWLINE) ? fenceMatch.index + 1 : fenceMatch.index;
+		fences.push(pos);
+	}
+
+	// If even number of fences (including 0), all code blocks are closed
+	if (fences.length % 2 === 0) {
+		return null;
+	}
+
+	// Odd number means last code block is incomplete
+	// The last fence is the opening of the incomplete block
+	const openingIndex = fences[fences.length - 1];
+	const afterOpening = markdown.slice(openingIndex + 3);
+
+	// Extract language and code content
+	const langMatch = afterOpening.match(LANG_PATTERN);
+	const language = langMatch?.[1] || DEFAULT_LANGUAGE;
+	const codeStartIndex = openingIndex + 3 + (langMatch?.[0]?.length ?? 0);
+	const code = markdown.slice(codeStartIndex);
+
+	return {
+		language,
+		code,
+		openingIndex
+	};
+}
diff --git a/tools/server/webui/src/lib/utils/data-url.ts b/tools/server/webui/src/lib/utils/data-url.ts
new file mode 100644
index 0000000000..6f55be793d
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/data-url.ts
@@ -0,0 +1,10 @@
+/**
+ * Creates a base64 data URL from MIME type and base64-encoded data.
+ *
+ * @param mimeType - The MIME type (e.g., 'image/png', 'audio/mp3')
+ * @param base64Data - The base64-encoded data
+ * @returns A data URL string in format 'data:{mimeType};base64,{data}'
+ */
+export function createBase64DataUrl(mimeType: string, base64Data: string): string {
+	return `data:${mimeType};base64,${base64Data}`;
+}
diff --git a/tools/server/webui/src/lib/utils/debounce.ts b/tools/server/webui/src/lib/utils/debounce.ts
new file mode 100644
index 0000000000..90a5a01783
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/debounce.ts
@@ -0,0 +1,22 @@
+/**
+ * @param fn - The function to debounce
+ * @param delay - The delay in milliseconds
+ * @returns A debounced version of the function
+ */
+export function debounce<T extends (...args: Parameters<T>) => void>(
+	fn: T,
+	delay: number
+): (...args: Parameters<T>) => void {
+	let timeoutId: ReturnType<typeof setTimeout> | null = null;
+
+	return (...args: Parameters<T>) => {
+		if (timeoutId) {
+			clearTimeout(timeoutId);
+		}
+
+		timeoutId = setTimeout(() => {
+			fn(...args);
+			timeoutId = null;
+		}, delay);
+	};
+}
diff --git a/tools/server/webui/src/lib/utils/image-error-fallback.ts b/tools/server/webui/src/lib/utils/image-error-fallback.ts
new file mode 100644
index 0000000000..6e3260f4ae
--- /dev/null
+++ b/tools/server/webui/src/lib/utils/image-error-fallback.ts
@@ -0,0 +1,10 @@
+/**
+ * Simplified HTML fallback for external images that fail to load.
+ * Displays a centered message with a link to open the image in a new tab.
+ */
+export function getImageErrorFallbackHtml(src: string): string {
+	return `<div class="image-error-content">
+		<span>Image cannot be displayed</span>
+		<a href="${src}" target="_blank" rel="noopener noreferrer">(open link)</a>
+	</div>`;
+}
diff --git a/tools/server/webui/src/lib/utils/index.ts b/tools/server/webui/src/lib/utils/index.ts
index 588167b8ca..5eb2bbaea1 100644
--- a/tools/server/webui/src/lib/utils/index.ts
+++ b/tools/server/webui/src/lib/utils/index.ts
@@ -9,6 +9,7 @@
 
 // API utilities
 export { getAuthHeaders, getJsonHeaders } from './api-headers';
+export { apiFetch, apiFetchWithParams, apiPost, type ApiFetchOptions } from './api-fetch';
 export { validateApiKey } from './api-key-validation';
 
 // Attachment utilities
@@ -75,8 +76,7 @@ export { maskInlineLaTeX, preprocessLaTeX } from './latex-protection';
 export {
 	isFileTypeSupportedByModel,
 	filterFilesByModalities,
-	generateModalityErrorMessage,
-	type ModalityCapabilities
+	generateModalityErrorMessage
 } from './modality-file-validation';
 
 // Model name utilities
@@ -93,3 +93,6 @@ export { getLanguageFromFilename } from './syntax-highlight-language';
 
 // Text file utilities
 export { isTextFileByName, readFileAsText, isLikelyTextFile } from './text-files';
+
+// Image error fallback utilities
+export { getImageErrorFallbackHtml } from './image-error-fallback';
diff --git a/tools/server/webui/src/lib/utils/modality-file-validation.ts b/tools/server/webui/src/lib/utils/modality-file-validation.ts
index 136c084146..02fb4e4a36 100644
--- a/tools/server/webui/src/lib/utils/modality-file-validation.ts
+++ b/tools/server/webui/src/lib/utils/modality-file-validation.ts
@@ -5,12 +5,7 @@
 
 import { getFileTypeCategory } from '$lib/utils';
 import { FileTypeCategory } from '$lib/enums';
-
-/** Modality capabilities for file validation */
-export interface ModalityCapabilities {
-	hasVision: boolean;
-	hasAudio: boolean;
-}
+import type { ModalityCapabilities } from '$lib/types/models';
 
 /**
  * Check if a file type is supported by the given modalities
diff --git a/tools/server/webui/src/lib/utils/text-files.ts b/tools/server/webui/src/lib/utils/text-files.ts
index e8006de64d..2f1a575d1d 100644
--- a/tools/server/webui/src/lib/utils/text-files.ts
+++ b/tools/server/webui/src/lib/utils/text-files.ts
@@ -3,10 +3,8 @@
  * Handles text file detection, reading, and validation
  */
 
-import {
-	DEFAULT_BINARY_DETECTION_OPTIONS,
-	type BinaryDetectionOptions
-} from '$lib/constants/binary-detection';
+import { DEFAULT_BINARY_DETECTION_OPTIONS } from '$lib/constants/binary-detection';
+import type { BinaryDetectionOptions } from '$lib/constants/binary-detection';
 import { FileExtensionText } from '$lib/enums';
 
 /**
diff --git a/tools/server/webui/tests/stories/ChatMessage.stories.svelte b/tools/server/webui/tests/stories/ChatMessage.stories.svelte
index 5f4de7d476..a3579cf04e 100644
--- a/tools/server/webui/tests/stories/ChatMessage.stories.svelte
+++ b/tools/server/webui/tests/stories/ChatMessage.stories.svelte
@@ -93,7 +93,7 @@
 	}}
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', false);
+		settingsStore.updateConfig('showRawOutputSwitch', false);
 	}}
 />
 
@@ -105,7 +105,7 @@
 	}}
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', false);
+		settingsStore.updateConfig('showRawOutputSwitch', false);
 	}}
 />
 
@@ -117,7 +117,7 @@
 	}}
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', false);
+		settingsStore.updateConfig('showRawOutputSwitch', false);
 	}}
 />
 
@@ -129,7 +129,7 @@
 	}}
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', true);
+		settingsStore.updateConfig('showRawOutputSwitch', true);
 	}}
 />
 
@@ -141,7 +141,7 @@
 	asChild
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', false);
+		settingsStore.updateConfig('showRawOutputSwitch', false);
 		// Phase 1: Stream reasoning content in chunks
 		let reasoningText =
 			'I need to think about this carefully. Let me break down the problem:\n\n1. The user is asking for help with something complex\n2. I should provide a thorough and helpful response\n3. I need to consider multiple approaches\n4. The best solution would be to explain step by step\n\nThis approach will ensure clarity and understanding.';
@@ -193,7 +193,7 @@
 	}}
 	play={async () => {
 		const { settingsStore } = await import('$lib/stores/settings.svelte');
-		settingsStore.updateConfig('disableReasoningFormat', false);
+		settingsStore.updateConfig('showRawOutputSwitch', false);
 		// Import the chat store to simulate loading state
 		const { chatStore } = await import('$lib/stores/chat.svelte');
 
diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt
index a8a59e02f4..a5887476af 100644
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
     set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
 
     set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
-    set(BORINGSSL_VERSION "0.20260204.0" CACHE STRING "BoringSSL version")
+    set(BORINGSSL_VERSION "0.20260211.0" CACHE STRING "BoringSSL version")
 
     message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
 
diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp
index ba5f9c8ff9..e309a7ad5d 100644
--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@@ -6,8 +6,472 @@ namespace httplib {
  * Implementation that will be part of the .cc file if split into .h + .cc.
  */
 
+namespace stream {
+
+// stream::Result implementations
+Result::Result() : chunk_size_(8192) {}
+
+Result::Result(ClientImpl::StreamHandle &&handle, size_t chunk_size)
+    : handle_(std::move(handle)), chunk_size_(chunk_size) {}
+
+Result::Result(Result &&other) noexcept
+    : handle_(std::move(other.handle_)), buffer_(std::move(other.buffer_)),
+      current_size_(other.current_size_), chunk_size_(other.chunk_size_),
+      finished_(other.finished_) {
+  other.current_size_ = 0;
+  other.finished_ = true;
+}
+
+Result &Result::operator=(Result &&other) noexcept {
+  if (this != &other) {
+    handle_ = std::move(other.handle_);
+    buffer_ = std::move(other.buffer_);
+    current_size_ = other.current_size_;
+    chunk_size_ = other.chunk_size_;
+    finished_ = other.finished_;
+    other.current_size_ = 0;
+    other.finished_ = true;
+  }
+  return *this;
+}
+
+bool Result::is_valid() const { return handle_.is_valid(); }
+Result::operator bool() const { return is_valid(); }
+
+int Result::status() const {
+  return handle_.response ? handle_.response->status : -1;
+}
+
+const Headers &Result::headers() const {
+  static const Headers empty_headers;
+  return handle_.response ? handle_.response->headers : empty_headers;
+}
+
+std::string Result::get_header_value(const std::string &key,
+                                            const char *def) const {
+  return handle_.response ? handle_.response->get_header_value(key, def) : def;
+}
+
+bool Result::has_header(const std::string &key) const {
+  return handle_.response ? handle_.response->has_header(key) : false;
+}
+
+Error Result::error() const { return handle_.error; }
+Error Result::read_error() const { return handle_.get_read_error(); }
+bool Result::has_read_error() const { return handle_.has_read_error(); }
+
+bool Result::next() {
+  if (!handle_.is_valid() || finished_) { return false; }
+
+  if (buffer_.size() < chunk_size_) { buffer_.resize(chunk_size_); }
+
+  ssize_t n = handle_.read(&buffer_[0], chunk_size_);
+  if (n > 0) {
+    current_size_ = static_cast<size_t>(n);
+    return true;
+  }
+
+  current_size_ = 0;
+  finished_ = true;
+  return false;
+}
+
+const char *Result::data() const { return buffer_.data(); }
+size_t Result::size() const { return current_size_; }
+
+std::string Result::read_all() {
+  std::string result;
+  while (next()) {
+    result.append(data(), size());
+  }
+  return result;
+}
+
+} // namespace stream
+
+namespace sse {
+
+// SSEMessage implementations
+SSEMessage::SSEMessage() : event("message") {}
+
+void SSEMessage::clear() {
+  event = "message";
+  data.clear();
+  id.clear();
+}
+
+// SSEClient implementations
+SSEClient::SSEClient(Client &client, const std::string &path)
+    : client_(client), path_(path) {}
+
+SSEClient::SSEClient(Client &client, const std::string &path,
+                            const Headers &headers)
+    : client_(client), path_(path), headers_(headers) {}
+
+SSEClient::~SSEClient() { stop(); }
+
+SSEClient &SSEClient::on_message(MessageHandler handler) {
+  on_message_ = std::move(handler);
+  return *this;
+}
+
+SSEClient &SSEClient::on_event(const std::string &type,
+                                      MessageHandler handler) {
+  event_handlers_[type] = std::move(handler);
+  return *this;
+}
+
+SSEClient &SSEClient::on_open(OpenHandler handler) {
+  on_open_ = std::move(handler);
+  return *this;
+}
+
+SSEClient &SSEClient::on_error(ErrorHandler handler) {
+  on_error_ = std::move(handler);
+  return *this;
+}
+
+SSEClient &SSEClient::set_reconnect_interval(int ms) {
+  reconnect_interval_ms_ = ms;
+  return *this;
+}
+
+SSEClient &SSEClient::set_max_reconnect_attempts(int n) {
+  max_reconnect_attempts_ = n;
+  return *this;
+}
+
+bool SSEClient::is_connected() const { return connected_.load(); }
+
+const std::string &SSEClient::last_event_id() const {
+  return last_event_id_;
+}
+
+void SSEClient::start() {
+  running_.store(true);
+  run_event_loop();
+}
+
+void SSEClient::start_async() {
+  running_.store(true);
+  async_thread_ = std::thread([this]() { run_event_loop(); });
+}
+
+void SSEClient::stop() {
+  running_.store(false);
+  client_.stop(); // Cancel any pending operations
+  if (async_thread_.joinable()) { async_thread_.join(); }
+}
+
+bool SSEClient::parse_sse_line(const std::string &line, SSEMessage &msg,
+                                      int &retry_ms) {
+  // Blank line signals end of event
+  if (line.empty() || line == "\r") { return true; }
+
+  // Lines starting with ':' are comments (ignored)
+  if (!line.empty() && line[0] == ':') { return false; }
+
+  // Find the colon separator
+  auto colon_pos = line.find(':');
+  if (colon_pos == std::string::npos) {
+    // Line with no colon is treated as field name with empty value
+    return false;
+  }
+
+  auto field = line.substr(0, colon_pos);
+  std::string value;
+
+  // Value starts after colon, skip optional single space
+  if (colon_pos + 1 < line.size()) {
+    auto value_start = colon_pos + 1;
+    if (line[value_start] == ' ') { value_start++; }
+    value = line.substr(value_start);
+    // Remove trailing \r if present
+    if (!value.empty() && value.back() == '\r') { value.pop_back(); }
+  }
+
+  // Handle known fields
+  if (field == "event") {
+    msg.event = value;
+  } else if (field == "data") {
+    // Multiple data lines are concatenated with newlines
+    if (!msg.data.empty()) { msg.data += "\n"; }
+    msg.data += value;
+  } else if (field == "id") {
+    // Empty id is valid (clears the last event ID)
+    msg.id = value;
+  } else if (field == "retry") {
+    // Parse retry interval in milliseconds
+    {
+      int v = 0;
+      auto res =
+          detail::from_chars(value.data(), value.data() + value.size(), v);
+      if (res.ec == std::errc{}) { retry_ms = v; }
+    }
+  }
+  // Unknown fields are ignored per SSE spec
+
+  return false;
+}
+
+void SSEClient::run_event_loop() {
+  auto reconnect_count = 0;
+
+  while (running_.load()) {
+    // Build headers, including Last-Event-ID if we have one
+    auto request_headers = headers_;
+    if (!last_event_id_.empty()) {
+      request_headers.emplace("Last-Event-ID", last_event_id_);
+    }
+
+    // Open streaming connection
+    auto result = stream::Get(client_, path_, request_headers);
+
+    // Connection error handling
+    if (!result) {
+      connected_.store(false);
+      if (on_error_) { on_error_(result.error()); }
+
+      if (!should_reconnect(reconnect_count)) { break; }
+      wait_for_reconnect();
+      reconnect_count++;
+      continue;
+    }
+
+    if (result.status() != 200) {
+      connected_.store(false);
+      // For certain errors, don't reconnect
+      if (result.status() == 204 || // No Content - server wants us to stop
+          result.status() == 404 || // Not Found
+          result.status() == 401 || // Unauthorized
+          result.status() == 403) { // Forbidden
+        if (on_error_) { on_error_(Error::Connection); }
+        break;
+      }
+
+      if (on_error_) { on_error_(Error::Connection); }
+
+      if (!should_reconnect(reconnect_count)) { break; }
+      wait_for_reconnect();
+      reconnect_count++;
+      continue;
+    }
+
+    // Connection successful
+    connected_.store(true);
+    reconnect_count = 0;
+    if (on_open_) { on_open_(); }
+
+    // Event receiving loop
+    std::string buffer;
+    SSEMessage current_msg;
+
+    while (running_.load() && result.next()) {
+      buffer.append(result.data(), result.size());
+
+      // Process complete lines in the buffer
+      size_t line_start = 0;
+      size_t newline_pos;
+
+      while ((newline_pos = buffer.find('\n', line_start)) !=
+             std::string::npos) {
+        auto line = buffer.substr(line_start, newline_pos - line_start);
+        line_start = newline_pos + 1;
+
+        // Parse the line and check if event is complete
+        auto event_complete =
+            parse_sse_line(line, current_msg, reconnect_interval_ms_);
+
+        if (event_complete && !current_msg.data.empty()) {
+          // Update last_event_id for reconnection
+          if (!current_msg.id.empty()) { last_event_id_ = current_msg.id; }
+
+          // Dispatch event to appropriate handler
+          dispatch_event(current_msg);
+
+          current_msg.clear();
+        }
+      }
+
+      // Keep unprocessed data in buffer
+      buffer.erase(0, line_start);
+    }
+
+    // Connection ended
+    connected_.store(false);
+
+    if (!running_.load()) { break; }
+
+    // Check for read errors
+    if (result.has_read_error()) {
+      if (on_error_) { on_error_(result.read_error()); }
+    }
+
+    if (!should_reconnect(reconnect_count)) { break; }
+    wait_for_reconnect();
+    reconnect_count++;
+  }
+
+  connected_.store(false);
+}
+
+void SSEClient::dispatch_event(const SSEMessage &msg) {
+  // Check for specific event type handler first
+  auto it = event_handlers_.find(msg.event);
+  if (it != event_handlers_.end()) {
+    it->second(msg);
+    return;
+  }
+
+  // Fall back to generic message handler
+  if (on_message_) { on_message_(msg); }
+}
+
+bool SSEClient::should_reconnect(int count) const {
+  if (!running_.load()) { return false; }
+  if (max_reconnect_attempts_ == 0) { return true; } // unlimited
+  return count < max_reconnect_attempts_;
+}
+
+void SSEClient::wait_for_reconnect() {
+  // Use small increments to check running_ flag frequently
+  auto waited = 0;
+  while (running_.load() && waited < reconnect_interval_ms_) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    waited += 100;
+  }
+}
+
+} // namespace sse
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+/*
+ * TLS abstraction layer - internal function declarations
+ * These are implementation details and not part of the public API.
+ */
+namespace tls {
+
+// Client context
+ctx_t create_client_context();
+void free_context(ctx_t ctx);
+bool set_min_version(ctx_t ctx, Version version);
+bool load_ca_pem(ctx_t ctx, const char *pem, size_t len);
+bool load_ca_file(ctx_t ctx, const char *file_path);
+bool load_ca_dir(ctx_t ctx, const char *dir_path);
+bool load_system_certs(ctx_t ctx);
+bool set_client_cert_pem(ctx_t ctx, const char *cert, const char *key,
+                         const char *password);
+bool set_client_cert_file(ctx_t ctx, const char *cert_path,
+                          const char *key_path, const char *password);
+
+// Server context
+ctx_t create_server_context();
+bool set_server_cert_pem(ctx_t ctx, const char *cert, const char *key,
+                         const char *password);
+bool set_server_cert_file(ctx_t ctx, const char *cert_path,
+                          const char *key_path, const char *password);
+bool set_client_ca_file(ctx_t ctx, const char *ca_file, const char *ca_dir);
+void set_verify_client(ctx_t ctx, bool require);
+
+// Session management
+session_t create_session(ctx_t ctx, socket_t sock);
+void free_session(session_t session);
+bool set_sni(session_t session, const char *hostname);
+bool set_hostname(session_t session, const char *hostname);
+
+// Handshake (non-blocking capable)
+TlsError connect(session_t session);
+TlsError accept(session_t session);
+
+// Handshake with timeout (blocking until timeout)
+bool connect_nonblocking(session_t session, socket_t sock, time_t timeout_sec,
+                         time_t timeout_usec, TlsError *err);
+bool accept_nonblocking(session_t session, socket_t sock, time_t timeout_sec,
+                        time_t timeout_usec, TlsError *err);
+
+// I/O (non-blocking capable)
+ssize_t read(session_t session, void *buf, size_t len, TlsError &err);
+ssize_t write(session_t session, const void *buf, size_t len, TlsError &err);
+int pending(const_session_t session);
+void shutdown(session_t session, bool graceful);
+
+// Connection state
+bool is_peer_closed(session_t session, socket_t sock);
+
+// Certificate verification
+cert_t get_peer_cert(const_session_t session);
+void free_cert(cert_t cert);
+bool verify_hostname(cert_t cert, const char *hostname);
+uint64_t hostname_mismatch_code();
+long get_verify_result(const_session_t session);
+
+// Certificate introspection
+std::string get_cert_subject_cn(cert_t cert);
+std::string get_cert_issuer_name(cert_t cert);
+bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans);
+bool get_cert_validity(cert_t cert, time_t &not_before, time_t &not_after);
+std::string get_cert_serial(cert_t cert);
+bool get_cert_der(cert_t cert, std::vector<unsigned char> &der);
+const char *get_sni(const_session_t session);
+
+// CA store management
+ca_store_t create_ca_store(const char *pem, size_t len);
+void free_ca_store(ca_store_t store);
+bool set_ca_store(ctx_t ctx, ca_store_t store);
+size_t get_ca_certs(ctx_t ctx, std::vector<cert_t> &certs);
+std::vector<std::string> get_ca_names(ctx_t ctx);
+
+// Dynamic certificate update (for servers)
+bool update_server_cert(ctx_t ctx, const char *cert_pem, const char *key_pem,
+                        const char *password);
+bool update_server_client_ca(ctx_t ctx, const char *ca_pem);
+
+// Certificate verification callback
+bool set_verify_callback(ctx_t ctx, VerifyCallback callback);
+long get_verify_error(const_session_t session);
+std::string verify_error_string(long error_code);
+
+// TlsError information
+uint64_t peek_error();
+uint64_t get_error();
+std::string error_string(uint64_t code);
+
+} // namespace tls
+#endif // CPPHTTPLIB_SSL_ENABLED
+
+/*
+ * Group 1: detail namespace - Non-SSL utilities
+ */
+
 namespace detail {
 
+bool set_socket_opt_impl(socket_t sock, int level, int optname,
+                                const void *optval, socklen_t optlen) {
+  return setsockopt(sock, level, optname,
+#ifdef _WIN32
+                    reinterpret_cast<const char *>(optval),
+#else
+                    optval,
+#endif
+                    optlen) == 0;
+}
+
+bool set_socket_opt(socket_t sock, int level, int optname, int optval) {
+  return set_socket_opt_impl(sock, level, optname, &optval, sizeof(optval));
+}
+
+bool set_socket_opt_time(socket_t sock, int level, int optname,
+                                time_t sec, time_t usec) {
+#ifdef _WIN32
+  auto timeout = static_cast<uint32_t>(sec * 1000 + usec / 1000);
+#else
+  timeval timeout;
+  timeout.tv_sec = static_cast<long>(sec);
+  timeout.tv_usec = static_cast<decltype(timeout.tv_usec)>(usec);
+#endif
+  return set_socket_opt_impl(sock, level, optname, &timeout, sizeof(timeout));
+}
+
 bool is_hex(char c, int &v) {
   if (isdigit(c)) {
     v = c - '0';
@@ -940,39 +1404,6 @@ private:
   static const size_t read_buff_size_ = 1024l * 4;
 };
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream final : public Stream {
-public:
-  SSLSocketStream(
-      socket_t sock, SSL *ssl, time_t read_timeout_sec,
-      time_t read_timeout_usec, time_t write_timeout_sec,
-      time_t write_timeout_usec, time_t max_timeout_msec = 0,
-      std::chrono::time_point<std::chrono::steady_clock> start_time =
-          (std::chrono::steady_clock::time_point::min)());
-  ~SSLSocketStream() override;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-private:
-  socket_t sock_;
-  SSL *ssl_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-  time_t max_timeout_msec_;
-  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
-};
-#endif
-
 bool keep_alive(const std::atomic<socket_t> &svr_sock, socket_t sock,
                        time_t keep_alive_timeout_sec) {
   using namespace std::chrono;
@@ -2270,14 +2701,23 @@ bool read_headers(Stream &strm, Headers &headers) {
   return true;
 }
 
-bool read_content_with_length(Stream &strm, size_t len,
-                                     DownloadProgress progress,
-                                     ContentReceiverWithProgress out) {
+enum class ReadContentResult {
+  Success,         // Successfully read the content
+  PayloadTooLarge, // The content exceeds the specified payload limit
+  Error            // An error occurred while reading the content
+};
+
+ReadContentResult read_content_with_length(
+    Stream &strm, size_t len, DownloadProgress progress,
+    ContentReceiverWithProgress out,
+    size_t payload_max_length = (std::numeric_limits<size_t>::max)()) {
   char buf[CPPHTTPLIB_RECV_BUFSIZ];
 
   detail::BodyReader br;
   br.stream = &strm;
+  br.has_content_length = true;
   br.content_length = len;
+  br.payload_max_length = payload_max_length;
   br.chunked = false;
   br.bytes_read = 0;
   br.last_error = Error::Success;
@@ -2287,36 +2727,27 @@ bool read_content_with_length(Stream &strm, size_t len,
     auto read_len = static_cast<size_t>(len - r);
     auto to_read = (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ);
     auto n = detail::read_body_content(&strm, br, buf, to_read);
-    if (n <= 0) { return false; }
+    if (n <= 0) {
+      // Check if it was a payload size error
+      if (br.last_error == Error::ExceedMaxPayloadSize) {
+        return ReadContentResult::PayloadTooLarge;
+      }
+      return ReadContentResult::Error;
+    }
 
-    if (!out(buf, static_cast<size_t>(n), r, len)) { return false; }
+    if (!out(buf, static_cast<size_t>(n), r, len)) {
+      return ReadContentResult::Error;
+    }
     r += static_cast<size_t>(n);
 
     if (progress) {
-      if (!progress(r, len)) { return false; }
+      if (!progress(r, len)) { return ReadContentResult::Error; }
     }
   }
 
-  return true;
+  return ReadContentResult::Success;
 }
 
-void skip_content_with_length(Stream &strm, size_t len) {
-  char buf[CPPHTTPLIB_RECV_BUFSIZ];
-  size_t r = 0;
-  while (r < len) {
-    auto read_len = static_cast<size_t>(len - r);
-    auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
-    if (n <= 0) { return; }
-    r += static_cast<size_t>(n);
-  }
-}
-
-enum class ReadContentResult {
-  Success,         // Successfully read the content
-  PayloadTooLarge, // The content exceeds the specified payload limit
-  Error            // An error occurred while reading the content
-};
-
 ReadContentResult
 read_content_without_length(Stream &strm, size_t payload_max_length,
                             ContentReceiverWithProgress out) {
@@ -2462,12 +2893,13 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
 
           if (is_invalid_value) {
             ret = false;
-          } else if (len > payload_max_length) {
-            exceed_payload_max_length = true;
-            skip_content_with_length(strm, len);
-            ret = false;
           } else if (len > 0) {
-            ret = read_content_with_length(strm, len, std::move(progress), out);
+            auto result = read_content_with_length(
+                strm, len, std::move(progress), out, payload_max_length);
+            ret = (result == ReadContentResult::Success);
+            if (result == ReadContentResult::PayloadTooLarge) {
+              exceed_payload_max_length = true;
+            }
           }
         }
 
@@ -3645,226 +4077,6 @@ bool has_crlf(const std::string &s) {
   return false;
 }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-std::string message_digest(const std::string &s, const EVP_MD *algo) {
-  auto context = std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)>(
-      EVP_MD_CTX_new(), EVP_MD_CTX_free);
-
-  unsigned int hash_length = 0;
-  unsigned char hash[EVP_MAX_MD_SIZE];
-
-  EVP_DigestInit_ex(context.get(), algo, nullptr);
-  EVP_DigestUpdate(context.get(), s.c_str(), s.size());
-  EVP_DigestFinal_ex(context.get(), hash, &hash_length);
-
-  std::stringstream ss;
-  for (auto i = 0u; i < hash_length; ++i) {
-    ss << std::hex << std::setw(2) << std::setfill('0')
-       << static_cast<unsigned int>(hash[i]);
-  }
-
-  return ss.str();
-}
-
-std::string MD5(const std::string &s) {
-  return message_digest(s, EVP_md5());
-}
-
-std::string SHA_256(const std::string &s) {
-  return message_digest(s, EVP_sha256());
-}
-
-std::string SHA_512(const std::string &s) {
-  return message_digest(s, EVP_sha512());
-}
-
-std::pair<std::string, std::string> make_digest_authentication_header(
-    const Request &req, const std::map<std::string, std::string> &auth,
-    size_t cnonce_count, const std::string &cnonce, const std::string &username,
-    const std::string &password, bool is_proxy = false) {
-  std::string nc;
-  {
-    std::stringstream ss;
-    ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count;
-    nc = ss.str();
-  }
-
-  std::string qop;
-  if (auth.find("qop") != auth.end()) {
-    qop = auth.at("qop");
-    if (qop.find("auth-int") != std::string::npos) {
-      qop = "auth-int";
-    } else if (qop.find("auth") != std::string::npos) {
-      qop = "auth";
-    } else {
-      qop.clear();
-    }
-  }
-
-  std::string algo = "MD5";
-  if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); }
-
-  std::string response;
-  {
-    auto H = algo == "SHA-256"   ? detail::SHA_256
-             : algo == "SHA-512" ? detail::SHA_512
-                                 : detail::MD5;
-
-    auto A1 = username + ":" + auth.at("realm") + ":" + password;
-
-    auto A2 = req.method + ":" + req.path;
-    if (qop == "auth-int") { A2 += ":" + H(req.body); }
-
-    if (qop.empty()) {
-      response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2));
-    } else {
-      response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce +
-                   ":" + qop + ":" + H(A2));
-    }
-  }
-
-  auto opaque = (auth.find("opaque") != auth.end()) ? auth.at("opaque") : "";
-
-  auto field = "Digest username=\"" + username + "\", realm=\"" +
-               auth.at("realm") + "\", nonce=\"" + auth.at("nonce") +
-               "\", uri=\"" + req.path + "\", algorithm=" + algo +
-               (qop.empty() ? ", response=\""
-                            : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" +
-                                  cnonce + "\", response=\"") +
-               response + "\"" +
-               (opaque.empty() ? "" : ", opaque=\"" + opaque + "\"");
-
-  auto key = is_proxy ? "Proxy-Authorization" : "Authorization";
-  return std::make_pair(key, field);
-}
-
-bool is_ssl_peer_could_be_closed(SSL *ssl, socket_t sock) {
-  detail::set_nonblocking(sock, true);
-  auto se = detail::scope_exit([&]() { detail::set_nonblocking(sock, false); });
-
-  char buf[1];
-  return !SSL_peek(ssl, buf, 1) &&
-         SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN;
-}
-
-#ifdef _WIN32
-// NOTE: This code came up with the following stackoverflow post:
-// https://stackoverflow.com/questions/9507184/can-openssl-on-windows-use-the-system-certificate-store
-bool load_system_certs_on_windows(X509_STORE *store) {
-  auto hStore = CertOpenSystemStoreW((HCRYPTPROV_LEGACY)NULL, L"ROOT");
-  if (!hStore) { return false; }
-
-  auto result = false;
-  PCCERT_CONTEXT pContext = NULL;
-  while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) !=
-         nullptr) {
-    auto encoded_cert =
-        static_cast<const unsigned char *>(pContext->pbCertEncoded);
-
-    auto x509 = d2i_X509(NULL, &encoded_cert, pContext->cbCertEncoded);
-    if (x509) {
-      X509_STORE_add_cert(store, x509);
-      X509_free(x509);
-      result = true;
-    }
-  }
-
-  CertFreeCertificateContext(pContext);
-  CertCloseStore(hStore, 0);
-
-  return result;
-}
-#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && TARGET_OS_MAC
-template <typename T>
-using CFObjectPtr =
-    std::unique_ptr<typename std::remove_pointer<T>::type, void (*)(CFTypeRef)>;
-
-void cf_object_ptr_deleter(CFTypeRef obj) {
-  if (obj) { CFRelease(obj); }
-}
-
-bool retrieve_certs_from_keychain(CFObjectPtr<CFArrayRef> &certs) {
-  CFStringRef keys[] = {kSecClass, kSecMatchLimit, kSecReturnRef};
-  CFTypeRef values[] = {kSecClassCertificate, kSecMatchLimitAll,
-                        kCFBooleanTrue};
-
-  CFObjectPtr<CFDictionaryRef> query(
-      CFDictionaryCreate(nullptr, reinterpret_cast<const void **>(keys), values,
-                         sizeof(keys) / sizeof(keys[0]),
-                         &kCFTypeDictionaryKeyCallBacks,
-                         &kCFTypeDictionaryValueCallBacks),
-      cf_object_ptr_deleter);
-
-  if (!query) { return false; }
-
-  CFTypeRef security_items = nullptr;
-  if (SecItemCopyMatching(query.get(), &security_items) != errSecSuccess ||
-      CFArrayGetTypeID() != CFGetTypeID(security_items)) {
-    return false;
-  }
-
-  certs.reset(reinterpret_cast<CFArrayRef>(security_items));
-  return true;
-}
-
-bool retrieve_root_certs_from_keychain(CFObjectPtr<CFArrayRef> &certs) {
-  CFArrayRef root_security_items = nullptr;
-  if (SecTrustCopyAnchorCertificates(&root_security_items) != errSecSuccess) {
-    return false;
-  }
-
-  certs.reset(root_security_items);
-  return true;
-}
-
-bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) {
-  auto result = false;
-  for (auto i = 0; i < CFArrayGetCount(certs); ++i) {
-    const auto cert = reinterpret_cast<const __SecCertificate *>(
-        CFArrayGetValueAtIndex(certs, i));
-
-    if (SecCertificateGetTypeID() != CFGetTypeID(cert)) { continue; }
-
-    CFDataRef cert_data = nullptr;
-    if (SecItemExport(cert, kSecFormatX509Cert, 0, nullptr, &cert_data) !=
-        errSecSuccess) {
-      continue;
-    }
-
-    CFObjectPtr<CFDataRef> cert_data_ptr(cert_data, cf_object_ptr_deleter);
-
-    auto encoded_cert = static_cast<const unsigned char *>(
-        CFDataGetBytePtr(cert_data_ptr.get()));
-
-    auto x509 =
-        d2i_X509(NULL, &encoded_cert, CFDataGetLength(cert_data_ptr.get()));
-
-    if (x509) {
-      X509_STORE_add_cert(store, x509);
-      X509_free(x509);
-      result = true;
-    }
-  }
-
-  return result;
-}
-
-bool load_system_certs_on_macos(X509_STORE *store) {
-  auto result = false;
-  CFObjectPtr<CFArrayRef> certs(nullptr, cf_object_ptr_deleter);
-  if (retrieve_certs_from_keychain(certs) && certs) {
-    result = add_certs_to_x509_store(certs.get(), store);
-  }
-
-  if (retrieve_root_certs_from_keychain(certs) && certs) {
-    result = add_certs_to_x509_store(certs.get(), store) || result;
-  }
-
-  return result;
-}
-#endif // _WIN32
-#endif // CPPHTTPLIB_OPENSSL_SUPPORT
-
 #ifdef _WIN32
 class WSInit {
 public:
@@ -3984,8 +4196,393 @@ bool is_field_content(const std::string &s) {
 bool is_field_value(const std::string &s) { return is_field_content(s); }
 
 } // namespace fields
+} // namespace detail
+
+/*
+ * Group 2: detail namespace - SSL common utilities
+ */
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+namespace detail {
+
+class SSLSocketStream final : public Stream {
+public:
+  SSLSocketStream(
+      socket_t sock, tls::session_t session, time_t read_timeout_sec,
+      time_t read_timeout_usec, time_t write_timeout_sec,
+      time_t write_timeout_usec, time_t max_timeout_msec = 0,
+      std::chrono::time_point<std::chrono::steady_clock> start_time =
+          (std::chrono::steady_clock::time_point::min)());
+  ~SSLSocketStream() override;
+
+  bool is_readable() const override;
+  bool wait_readable() const override;
+  bool wait_writable() const override;
+  ssize_t read(char *ptr, size_t size) override;
+  ssize_t write(const char *ptr, size_t size) override;
+  void get_remote_ip_and_port(std::string &ip, int &port) const override;
+  void get_local_ip_and_port(std::string &ip, int &port) const override;
+  socket_t socket() const override;
+  time_t duration() const override;
+
+private:
+  socket_t sock_;
+  tls::session_t session_;
+  time_t read_timeout_sec_;
+  time_t read_timeout_usec_;
+  time_t write_timeout_sec_;
+  time_t write_timeout_usec_;
+  time_t max_timeout_msec_;
+  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
+};
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+std::string message_digest(const std::string &s, const EVP_MD *algo) {
+  auto context = std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)>(
+      EVP_MD_CTX_new(), EVP_MD_CTX_free);
+
+  unsigned int hash_length = 0;
+  unsigned char hash[EVP_MAX_MD_SIZE];
+
+  EVP_DigestInit_ex(context.get(), algo, nullptr);
+  EVP_DigestUpdate(context.get(), s.c_str(), s.size());
+  EVP_DigestFinal_ex(context.get(), hash, &hash_length);
+
+  std::stringstream ss;
+  for (auto i = 0u; i < hash_length; ++i) {
+    ss << std::hex << std::setw(2) << std::setfill('0')
+       << static_cast<unsigned int>(hash[i]);
+  }
+
+  return ss.str();
+}
+
+std::string MD5(const std::string &s) {
+  return message_digest(s, EVP_md5());
+}
+
+std::string SHA_256(const std::string &s) {
+  return message_digest(s, EVP_sha256());
+}
+
+std::string SHA_512(const std::string &s) {
+  return message_digest(s, EVP_sha512());
+}
+#elif defined(CPPHTTPLIB_MBEDTLS_SUPPORT)
+namespace {
+template <size_t N>
+std::string hash_to_hex(const unsigned char (&hash)[N]) {
+  std::stringstream ss;
+  for (size_t i = 0; i < N; ++i) {
+    ss << std::hex << std::setw(2) << std::setfill('0')
+       << static_cast<unsigned int>(hash[i]);
+  }
+  return ss.str();
+}
+} // namespace
+
+std::string MD5(const std::string &s) {
+  unsigned char hash[16];
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  mbedtls_md5(reinterpret_cast<const unsigned char *>(s.c_str()), s.size(),
+              hash);
+#else
+  mbedtls_md5_ret(reinterpret_cast<const unsigned char *>(s.c_str()), s.size(),
+                  hash);
+#endif
+  return hash_to_hex(hash);
+}
+
+std::string SHA_256(const std::string &s) {
+  unsigned char hash[32];
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  mbedtls_sha256(reinterpret_cast<const unsigned char *>(s.c_str()), s.size(),
+                 hash, 0);
+#else
+  mbedtls_sha256_ret(reinterpret_cast<const unsigned char *>(s.c_str()),
+                     s.size(), hash, 0);
+#endif
+  return hash_to_hex(hash);
+}
+
+std::string SHA_512(const std::string &s) {
+  unsigned char hash[64];
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  mbedtls_sha512(reinterpret_cast<const unsigned char *>(s.c_str()), s.size(),
+                 hash, 0);
+#else
+  mbedtls_sha512_ret(reinterpret_cast<const unsigned char *>(s.c_str()),
+                     s.size(), hash, 0);
+#endif
+  return hash_to_hex(hash);
+}
+#endif
+
+bool is_ip_address(const std::string &host) {
+  struct in_addr addr4;
+  struct in6_addr addr6;
+  return inet_pton(AF_INET, host.c_str(), &addr4) == 1 ||
+         inet_pton(AF_INET6, host.c_str(), &addr6) == 1;
+}
+
+template <typename T>
+bool process_server_socket_ssl(
+    const std::atomic<socket_t> &svr_sock, tls::session_t session,
+    socket_t sock, size_t keep_alive_max_count, time_t keep_alive_timeout_sec,
+    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
+    time_t write_timeout_usec, T callback) {
+  return process_server_socket_core(
+      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
+      [&](bool close_connection, bool &connection_closed) {
+        SSLSocketStream strm(sock, session, read_timeout_sec, read_timeout_usec,
+                             write_timeout_sec, write_timeout_usec);
+        return callback(strm, close_connection, connection_closed);
+      });
+}
+
+template <typename T>
+bool process_client_socket_ssl(
+    tls::session_t session, socket_t sock, time_t read_timeout_sec,
+    time_t read_timeout_usec, time_t write_timeout_sec,
+    time_t write_timeout_usec, time_t max_timeout_msec,
+    std::chrono::time_point<std::chrono::steady_clock> start_time, T callback) {
+  SSLSocketStream strm(sock, session, read_timeout_sec, read_timeout_usec,
+                       write_timeout_sec, write_timeout_usec, max_timeout_msec,
+                       start_time);
+  return callback(strm);
+}
+
+std::pair<std::string, std::string> make_digest_authentication_header(
+    const Request &req, const std::map<std::string, std::string> &auth,
+    size_t cnonce_count, const std::string &cnonce, const std::string &username,
+    const std::string &password, bool is_proxy = false) {
+  std::string nc;
+  {
+    std::stringstream ss;
+    ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count;
+    nc = ss.str();
+  }
+
+  std::string qop;
+  if (auth.find("qop") != auth.end()) {
+    qop = auth.at("qop");
+    if (qop.find("auth-int") != std::string::npos) {
+      qop = "auth-int";
+    } else if (qop.find("auth") != std::string::npos) {
+      qop = "auth";
+    } else {
+      qop.clear();
+    }
+  }
+
+  std::string algo = "MD5";
+  if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); }
+
+  std::string response;
+  {
+    auto H = algo == "SHA-256"   ? detail::SHA_256
+             : algo == "SHA-512" ? detail::SHA_512
+                                 : detail::MD5;
+
+    auto A1 = username + ":" + auth.at("realm") + ":" + password;
+
+    auto A2 = req.method + ":" + req.path;
+    if (qop == "auth-int") { A2 += ":" + H(req.body); }
+
+    if (qop.empty()) {
+      response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2));
+    } else {
+      response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce +
+                   ":" + qop + ":" + H(A2));
+    }
+  }
+
+  auto opaque = (auth.find("opaque") != auth.end()) ? auth.at("opaque") : "";
+
+  auto field = "Digest username=\"" + username + "\", realm=\"" +
+               auth.at("realm") + "\", nonce=\"" + auth.at("nonce") +
+               "\", uri=\"" + req.path + "\", algorithm=" + algo +
+               (qop.empty() ? ", response=\""
+                            : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" +
+                                  cnonce + "\", response=\"") +
+               response + "\"" +
+               (opaque.empty() ? "" : ", opaque=\"" + opaque + "\"");
+
+  auto key = is_proxy ? "Proxy-Authorization" : "Authorization";
+  return std::make_pair(key, field);
+}
+
+bool match_hostname(const std::string &pattern,
+                           const std::string &hostname) {
+  // Exact match (case-insensitive)
+  if (detail::case_ignore::equal(hostname, pattern)) { return true; }
+
+  // Split both pattern and hostname into components by '.'
+  std::vector<std::string> pattern_components;
+  if (!pattern.empty()) {
+    split(pattern.data(), pattern.data() + pattern.size(), '.',
+          [&](const char *b, const char *e) {
+            pattern_components.emplace_back(b, e);
+          });
+  }
+
+  std::vector<std::string> host_components;
+  if (!hostname.empty()) {
+    split(hostname.data(), hostname.data() + hostname.size(), '.',
+          [&](const char *b, const char *e) {
+            host_components.emplace_back(b, e);
+          });
+  }
+
+  // Component count must match
+  if (host_components.size() != pattern_components.size()) { return false; }
+
+  // Compare each component with wildcard support
+  // Supports: "*" (full wildcard), "prefix*" (partial wildcard)
+  // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
+  auto itr = pattern_components.begin();
+  for (const auto &h : host_components) {
+    auto &p = *itr;
+    if (!detail::case_ignore::equal(p, h) && p != "*") {
+      bool partial_match = false;
+      if (!p.empty() && p[p.size() - 1] == '*') {
+        const auto prefix_length = p.size() - 1;
+        if (prefix_length == 0) {
+          partial_match = true;
+        } else if (h.size() >= prefix_length) {
+          partial_match =
+              std::equal(p.begin(),
+                         p.begin() + static_cast<std::string::difference_type>(
+                                         prefix_length),
+                         h.begin(), [](const char ca, const char cb) {
+                           return detail::case_ignore::to_lower(ca) ==
+                                  detail::case_ignore::to_lower(cb);
+                         });
+        }
+      }
+      if (!partial_match) { return false; }
+    }
+    ++itr;
+  }
+
+  return true;
+}
+
+#ifdef _WIN32
+// Verify certificate using Windows CertGetCertificateChain API.
+// This provides real-time certificate validation with Windows Update
+// integration, independent of the TLS backend (OpenSSL or MbedTLS).
+bool verify_cert_with_windows_schannel(
+    const std::vector<unsigned char> &der_cert, const std::string &hostname,
+    bool verify_hostname, unsigned long &out_error) {
+  if (der_cert.empty()) { return false; }
+
+  out_error = 0;
+
+  // Create Windows certificate context from DER data
+  auto cert_context = CertCreateCertificateContext(
+      X509_ASN_ENCODING | PKCS_7_ASN_ENCODING, der_cert.data(),
+      static_cast<DWORD>(der_cert.size()));
+
+  if (!cert_context) {
+    out_error = GetLastError();
+    return false;
+  }
+
+  auto cert_guard =
+      scope_exit([&] { CertFreeCertificateContext(cert_context); });
+
+  // Setup chain parameters
+  CERT_CHAIN_PARA chain_para = {};
+  chain_para.cbSize = sizeof(chain_para);
+
+  // Build certificate chain with revocation checking
+  PCCERT_CHAIN_CONTEXT chain_context = nullptr;
+  auto chain_result = CertGetCertificateChain(
+      nullptr, cert_context, nullptr, cert_context->hCertStore, &chain_para,
+      CERT_CHAIN_CACHE_END_CERT | CERT_CHAIN_REVOCATION_CHECK_END_CERT |
+          CERT_CHAIN_REVOCATION_ACCUMULATIVE_TIMEOUT,
+      nullptr, &chain_context);
+
+  if (!chain_result || !chain_context) {
+    out_error = GetLastError();
+    return false;
+  }
+
+  auto chain_guard =
+      scope_exit([&] { CertFreeCertificateChain(chain_context); });
+
+  // Check if chain has errors
+  if (chain_context->TrustStatus.dwErrorStatus != CERT_TRUST_NO_ERROR) {
+    out_error = chain_context->TrustStatus.dwErrorStatus;
+    return false;
+  }
+
+  // Verify SSL policy
+  SSL_EXTRA_CERT_CHAIN_POLICY_PARA extra_policy_para = {};
+  extra_policy_para.cbSize = sizeof(extra_policy_para);
+#ifdef AUTHTYPE_SERVER
+  extra_policy_para.dwAuthType = AUTHTYPE_SERVER;
+#endif
+
+  std::wstring whost;
+  if (verify_hostname) {
+    whost = u8string_to_wstring(hostname.c_str());
+    extra_policy_para.pwszServerName = const_cast<wchar_t *>(whost.c_str());
+  }
+
+  CERT_CHAIN_POLICY_PARA policy_para = {};
+  policy_para.cbSize = sizeof(policy_para);
+#ifdef CERT_CHAIN_POLICY_IGNORE_ALL_REV_UNKNOWN_FLAGS
+  policy_para.dwFlags = CERT_CHAIN_POLICY_IGNORE_ALL_REV_UNKNOWN_FLAGS;
+#else
+  policy_para.dwFlags = 0;
+#endif
+  policy_para.pvExtraPolicyPara = &extra_policy_para;
+
+  CERT_CHAIN_POLICY_STATUS policy_status = {};
+  policy_status.cbSize = sizeof(policy_status);
+
+  if (!CertVerifyCertificateChainPolicy(CERT_CHAIN_POLICY_SSL, chain_context,
+                                        &policy_para, &policy_status)) {
+    out_error = GetLastError();
+    return false;
+  }
+
+  if (policy_status.dwError != 0) {
+    out_error = policy_status.dwError;
+    return false;
+  }
+
+  return true;
+}
+#endif // _WIN32
 
 } // namespace detail
+#endif // CPPHTTPLIB_SSL_ENABLED
+
+/*
+ * Group 3: httplib namespace - Non-SSL public API implementations
+ */
+
+void default_socket_options(socket_t sock) {
+  detail::set_socket_opt(sock, SOL_SOCKET,
+#ifdef SO_REUSEPORT
+                         SO_REUSEPORT,
+#else
+                         SO_REUSEADDR,
+#endif
+                         1);
+}
+
+std::string get_bearer_token_auth(const Request &req) {
+  if (req.has_header("Authorization")) {
+    constexpr auto bearer_header_prefix_len = detail::str_len("Bearer ");
+    return req.get_header_value("Authorization")
+        .substr(bearer_header_prefix_len);
+  }
+  return "";
+}
 
 const char *status_message(int status) {
   switch (status) {
@@ -4426,6 +5023,11 @@ make_bearer_token_authentication_header(const std::string &token,
 }
 
 // Request implementation
+size_t Request::get_header_value_u64(const std::string &key, size_t def,
+                                            size_t id) const {
+  return detail::get_header_value_u64(headers, key, def, id);
+}
+
 bool Request::has_header(const std::string &key) const {
   return detail::has_header(headers, key);
 }
@@ -4547,6 +5149,11 @@ size_t MultipartFormData::get_file_count(const std::string &key) const {
 }
 
 // Response implementation
+size_t Response::get_header_value_u64(const std::string &key, size_t def,
+                                             size_t id) const {
+  return detail::get_header_value_u64(headers, key, def, id);
+}
+
 bool Response::has_header(const std::string &key) const {
   return headers.find(key) != headers.end();
 }
@@ -4662,6 +5269,12 @@ void Response::set_file_content(const std::string &path) {
 }
 
 // Result implementation
+size_t Result::get_request_header_value_u64(const std::string &key,
+                                                   size_t def,
+                                                   size_t id) const {
+  return detail::get_header_value_u64(request_headers_, key, def, id);
+}
+
 bool Result::has_request_header(const std::string &key) const {
   return request_headers_.find(key) != request_headers_.end();
 }
@@ -4697,13 +5310,16 @@ ssize_t detail::BodyReader::read(char *buf, size_t len) {
 
   if (!chunked) {
     // Content-Length based reading
-    if (bytes_read >= content_length) {
+    if (has_content_length && bytes_read >= content_length) {
       eof = true;
       return 0;
     }
 
-    auto remaining = content_length - bytes_read;
-    auto to_read = (std::min)(len, remaining);
+    auto to_read = len;
+    if (has_content_length) {
+      auto remaining = content_length - bytes_read;
+      to_read = (std::min)(len, remaining);
+    }
     auto n = stream->read(buf, to_read);
 
     if (n < 0) {
@@ -4721,7 +5337,12 @@ ssize_t detail::BodyReader::read(char *buf, size_t len) {
     }
 
     bytes_read += static_cast<size_t>(n);
-    if (bytes_read >= content_length) { eof = true; }
+    if (has_content_length && bytes_read >= content_length) { eof = true; }
+    if (payload_max_length > 0 && bytes_read > payload_max_length) {
+      last_error = Error::ExceedMaxPayloadSize;
+      eof = true;
+      return -1;
+    }
     return n;
   }
 
@@ -4745,9 +5366,83 @@ ssize_t detail::BodyReader::read(char *buf, size_t len) {
   }
 
   bytes_read += static_cast<size_t>(n);
+  if (payload_max_length > 0 && bytes_read > payload_max_length) {
+    last_error = Error::ExceedMaxPayloadSize;
+    eof = true;
+    return -1;
+  }
   return n;
 }
 
+// ThreadPool implementation
+ThreadPool::ThreadPool(size_t n, size_t mqr)
+    : shutdown_(false), max_queued_requests_(mqr) {
+  threads_.reserve(n);
+  while (n) {
+    threads_.emplace_back(worker(*this));
+    n--;
+  }
+}
+
+bool ThreadPool::enqueue(std::function<void()> fn) {
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) {
+      return false;
+    }
+    jobs_.push_back(std::move(fn));
+  }
+
+  cond_.notify_one();
+  return true;
+}
+
+void ThreadPool::shutdown() {
+  // Stop all worker threads...
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    shutdown_ = true;
+  }
+
+  cond_.notify_all();
+
+  // Join...
+  for (auto &t : threads_) {
+    t.join();
+  }
+}
+
+ThreadPool::worker::worker(ThreadPool &pool) : pool_(pool) {}
+
+void ThreadPool::worker::operator()() {
+  for (;;) {
+    std::function<void()> fn;
+    {
+      std::unique_lock<std::mutex> lock(pool_.mutex_);
+
+      pool_.cond_.wait(lock,
+                       [&] { return !pool_.jobs_.empty() || pool_.shutdown_; });
+
+      if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
+
+      fn = pool_.jobs_.front();
+      pool_.jobs_.pop_front();
+    }
+
+    assert(true == static_cast<bool>(fn));
+    fn();
+  }
+
+#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) &&   \
+    !defined(LIBRESSL_VERSION_NUMBER)
+  OPENSSL_thread_stop();
+#endif
+}
+
+/*
+ * Group 1 (continued): detail namespace - Stream implementations
+ */
+
 namespace detail {
 
 void calc_actual_timeout(time_t max_timeout_msec, time_t duration_msec,
@@ -5076,6 +5771,155 @@ bool check_and_write_headers(Stream &strm, Headers &headers,
 
 } // namespace detail
 
+/*
+ * Group 2 (continued): detail namespace - SSLSocketStream implementation
+ */
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+namespace detail {
+
+// SSL socket stream implementation
+SSLSocketStream::SSLSocketStream(
+    socket_t sock, tls::session_t session, time_t read_timeout_sec,
+    time_t read_timeout_usec, time_t write_timeout_sec,
+    time_t write_timeout_usec, time_t max_timeout_msec,
+    std::chrono::time_point<std::chrono::steady_clock> start_time)
+    : sock_(sock), session_(session), read_timeout_sec_(read_timeout_sec),
+      read_timeout_usec_(read_timeout_usec),
+      write_timeout_sec_(write_timeout_sec),
+      write_timeout_usec_(write_timeout_usec),
+      max_timeout_msec_(max_timeout_msec), start_time_(start_time) {
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+  // Clear AUTO_RETRY for proper non-blocking I/O timeout handling
+  // Note: create_session() also clears this, but SSLClient currently
+  // uses ssl_new() which does not. Until full TLS API migration is complete,
+  // we need to ensure AUTO_RETRY is cleared here regardless of how the
+  // SSL session was created.
+  SSL_clear_mode(static_cast<SSL *>(session), SSL_MODE_AUTO_RETRY);
+#endif
+}
+
+SSLSocketStream::~SSLSocketStream() = default;
+
+bool SSLSocketStream::is_readable() const {
+  return tls::pending(session_) > 0;
+}
+
+bool SSLSocketStream::wait_readable() const {
+  if (max_timeout_msec_ <= 0) {
+    return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
+  }
+
+  time_t read_timeout_sec;
+  time_t read_timeout_usec;
+  calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_,
+                      read_timeout_usec_, read_timeout_sec, read_timeout_usec);
+
+  return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0;
+}
+
+bool SSLSocketStream::wait_writable() const {
+  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
+         is_socket_alive(sock_) && !tls::is_peer_closed(session_, sock_);
+}
+
+ssize_t SSLSocketStream::read(char *ptr, size_t size) {
+  if (tls::pending(session_) > 0) {
+    tls::TlsError err;
+    auto ret = tls::read(session_, ptr, size, err);
+    if (ret == 0 || err.code == tls::ErrorCode::PeerClosed) {
+      error_ = Error::ConnectionClosed;
+    }
+    return ret;
+  } else if (wait_readable()) {
+    tls::TlsError err;
+    auto ret = tls::read(session_, ptr, size, err);
+    if (ret < 0) {
+      auto n = 1000;
+#ifdef _WIN32
+      while (--n >= 0 && (err.code == tls::ErrorCode::WantRead ||
+                          (err.code == tls::ErrorCode::SyscallError &&
+                           WSAGetLastError() == WSAETIMEDOUT))) {
+#else
+      while (--n >= 0 && err.code == tls::ErrorCode::WantRead) {
+#endif
+        if (tls::pending(session_) > 0) {
+          return tls::read(session_, ptr, size, err);
+        } else if (wait_readable()) {
+          std::this_thread::sleep_for(std::chrono::microseconds{10});
+          ret = tls::read(session_, ptr, size, err);
+          if (ret >= 0) { return ret; }
+        } else {
+          break;
+        }
+      }
+      assert(ret < 0);
+    } else if (ret == 0 || err.code == tls::ErrorCode::PeerClosed) {
+      error_ = Error::ConnectionClosed;
+    }
+    return ret;
+  } else {
+    error_ = Error::Timeout;
+    return -1;
+  }
+}
+
+ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
+  if (wait_writable()) {
+    auto handle_size =
+        std::min<size_t>(size, (std::numeric_limits<int>::max)());
+
+    tls::TlsError err;
+    auto ret = tls::write(session_, ptr, handle_size, err);
+    if (ret < 0) {
+      auto n = 1000;
+#ifdef _WIN32
+      while (--n >= 0 && (err.code == tls::ErrorCode::WantWrite ||
+                          (err.code == tls::ErrorCode::SyscallError &&
+                           WSAGetLastError() == WSAETIMEDOUT))) {
+#else
+      while (--n >= 0 && err.code == tls::ErrorCode::WantWrite) {
+#endif
+        if (wait_writable()) {
+          std::this_thread::sleep_for(std::chrono::microseconds{10});
+          ret = tls::write(session_, ptr, handle_size, err);
+          if (ret >= 0) { return ret; }
+        } else {
+          break;
+        }
+      }
+      assert(ret < 0);
+    }
+    return ret;
+  }
+  return -1;
+}
+
+void SSLSocketStream::get_remote_ip_and_port(std::string &ip,
+                                                    int &port) const {
+  detail::get_remote_ip_and_port(sock_, ip, port);
+}
+
+void SSLSocketStream::get_local_ip_and_port(std::string &ip,
+                                                   int &port) const {
+  detail::get_local_ip_and_port(sock_, ip, port);
+}
+
+socket_t SSLSocketStream::socket() const { return sock_; }
+
+time_t SSLSocketStream::duration() const {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::steady_clock::now() - start_time_)
+      .count();
+}
+
+} // namespace detail
+#endif // CPPHTTPLIB_SSL_ENABLED
+
+/*
+ * Group 4: Server implementation
+ */
+
 // HTTP server implementation
 Server::Server()
     : new_task_queue(
@@ -5677,36 +6521,40 @@ bool Server::read_content_core(
   // are true (no Transfer-Encoding and no Content-Length), then the message
   // body length is zero (no message body is present).
   //
-  // For non-SSL builds, peek into the socket to detect clients that send a
-  // body without a Content-Length header (raw HTTP over TCP). If there is
-  // pending data that exceeds the configured payload limit, treat this as an
-  // oversized request and fail early (causing connection close). For SSL
-  // builds we cannot reliably peek the decrypted application bytes, so keep
-  // the original behaviour.
-#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT)
+  // For non-SSL builds, detect clients that send a body without a
+  // Content-Length header (raw HTTP over TCP). Check both the stream's
+  // internal read buffer (data already read from the socket during header
+  // parsing) and the socket itself for pending data. If data is found and
+  // exceeds the configured payload limit, reject with 413.
+  // For SSL builds we cannot reliably peek the decrypted application bytes,
+  // so keep the original behaviour.
+#if !defined(CPPHTTPLIB_SSL_ENABLED)
   if (!req.has_header("Content-Length") &&
       !detail::is_chunked_transfer_encoding(req.headers)) {
-    // Only peek if payload_max_length is set to a finite value
+    // Only check if payload_max_length is set to a finite value
     if (payload_max_length_ > 0 &&
         payload_max_length_ < (std::numeric_limits<size_t>::max)()) {
-      socket_t s = strm.socket();
-      if (s != INVALID_SOCKET) {
-        // Peek to check if there is any pending data
-        char peekbuf[1];
-        ssize_t n = ::recv(s, peekbuf, 1, MSG_PEEK);
-        if (n > 0) {
-          // There is data, so read it with payload limit enforcement
-          auto result = detail::read_content_without_length(
-              strm, payload_max_length_, out);
-          if (result == detail::ReadContentResult::PayloadTooLarge) {
-            res.status = StatusCode::PayloadTooLarge_413;
-            return false;
-          } else if (result != detail::ReadContentResult::Success) {
-            return false;
-          }
-          return true;
+      // Check if there is data already buffered in the stream (read during
+      // header parsing) or pending on the socket. Use a non-blocking socket
+      // check to avoid deadlock when the client sends no body.
+      bool has_data = strm.is_readable();
+      if (!has_data) {
+        socket_t s = strm.socket();
+        if (s != INVALID_SOCKET) {
+          has_data = detail::select_read(s, 0, 0) > 0;
         }
       }
+      if (has_data) {
+        auto result =
+            detail::read_content_without_length(strm, payload_max_length_, out);
+        if (result == detail::ReadContentResult::PayloadTooLarge) {
+          res.status = StatusCode::PayloadTooLarge_413;
+          return false;
+        } else if (result != detail::ReadContentResult::Success) {
+          return false;
+        }
+        return true;
+      }
     }
     return true;
   }
@@ -5815,8 +6663,10 @@ bool Server::check_if_not_modified(const Request &req, Response &res,
       // simplified implementation requires exact matches.
       auto ret = detail::split_find(val.data(), val.data() + val.size(), ',',
                                     [&](const char *b, const char *e) {
-                                      return std::equal(b, e, "*") ||
-                                             std::equal(b, e, etag.begin());
+                                      auto seg_len = static_cast<size_t>(e - b);
+                                      return (seg_len == 1 && *b == '*') ||
+                                             (seg_len == etag.size() &&
+                                              std::equal(b, e, etag.begin()));
                                     });
 
       if (ret) {
@@ -6518,6 +7368,9 @@ void Server::output_error_log(const Error &err,
   }
 }
 
+/*
+ * Group 5: ClientImpl and Client (Universal) implementation
+ */
 // HTTP client implementation
 ClientImpl::ClientImpl(const std::string &host)
     : ClientImpl(host, 80, std::string(), std::string()) {}
@@ -6561,10 +7414,6 @@ void ClientImpl::copy_settings(const ClientImpl &rhs) {
   basic_auth_username_ = rhs.basic_auth_username_;
   basic_auth_password_ = rhs.basic_auth_password_;
   bearer_token_auth_token_ = rhs.bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  digest_auth_username_ = rhs.digest_auth_username_;
-  digest_auth_password_ = rhs.digest_auth_password_;
-#endif
   keep_alive_ = rhs.keep_alive_;
   follow_location_ = rhs.follow_location_;
   path_encode_ = rhs.path_encode_;
@@ -6574,28 +7423,27 @@ void ClientImpl::copy_settings(const ClientImpl &rhs) {
   socket_options_ = rhs.socket_options_;
   compress_ = rhs.compress_;
   decompress_ = rhs.decompress_;
+  payload_max_length_ = rhs.payload_max_length_;
+  has_payload_max_length_ = rhs.has_payload_max_length_;
   interface_ = rhs.interface_;
   proxy_host_ = rhs.proxy_host_;
   proxy_port_ = rhs.proxy_port_;
   proxy_basic_auth_username_ = rhs.proxy_basic_auth_username_;
   proxy_basic_auth_password_ = rhs.proxy_basic_auth_password_;
   proxy_bearer_token_auth_token_ = rhs.proxy_bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  proxy_digest_auth_username_ = rhs.proxy_digest_auth_username_;
-  proxy_digest_auth_password_ = rhs.proxy_digest_auth_password_;
-#endif
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  ca_cert_file_path_ = rhs.ca_cert_file_path_;
-  ca_cert_dir_path_ = rhs.ca_cert_dir_path_;
-  ca_cert_store_ = rhs.ca_cert_store_;
-#endif
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  server_certificate_verification_ = rhs.server_certificate_verification_;
-  server_hostname_verification_ = rhs.server_hostname_verification_;
-  server_certificate_verifier_ = rhs.server_certificate_verifier_;
-#endif
   logger_ = rhs.logger_;
   error_logger_ = rhs.error_logger_;
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  digest_auth_username_ = rhs.digest_auth_username_;
+  digest_auth_password_ = rhs.digest_auth_password_;
+  proxy_digest_auth_username_ = rhs.proxy_digest_auth_username_;
+  proxy_digest_auth_password_ = rhs.proxy_digest_auth_password_;
+  ca_cert_file_path_ = rhs.ca_cert_file_path_;
+  ca_cert_dir_path_ = rhs.ca_cert_dir_path_;
+  server_certificate_verification_ = rhs.server_certificate_verification_;
+  server_hostname_verification_ = rhs.server_hostname_verification_;
+#endif
 }
 
 socket_t ClientImpl::create_client_socket(Error &error) const {
@@ -6631,22 +7479,6 @@ bool ClientImpl::ensure_socket_connection(Socket &socket, Error &error) {
   return create_and_connect_socket(socket, error);
 }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-bool SSLClient::ensure_socket_connection(Socket &socket, Error &error) {
-  if (!ClientImpl::ensure_socket_connection(socket, error)) { return false; }
-
-  if (!proxy_host_.empty() && proxy_port_ != -1) { return true; }
-
-  if (!initialize_ssl(socket, error)) {
-    shutdown_socket(socket);
-    close_socket(socket);
-    return false;
-  }
-
-  return true;
-}
-#endif
-
 void ClientImpl::shutdown_ssl(Socket & /*socket*/,
                                      bool /*shutdown_gracefully*/) {
   // If there are any requests in flight from threads other than us, then it's
@@ -6671,9 +7503,10 @@ void ClientImpl::close_socket(Socket &socket) {
          socket_requests_are_from_thread_ == std::this_thread::get_id());
 
   // It is also a bug if this happens while SSL is still active
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
   assert(socket.ssl == nullptr);
 #endif
+
   if (socket.sock == INVALID_SOCKET) { return; }
   detail::close_socket(socket.sock);
   socket.sock = INVALID_SOCKET;
@@ -6722,6 +7555,8 @@ bool ClientImpl::send(Request &req, Response &res, Error &error) {
   if (error == Error::SSLPeerCouldBeClosed_) {
     assert(!ret);
     ret = send_(req, res, error);
+    // If still failing with SSLPeerCouldBeClosed_, convert to Read error
+    if (error == Error::SSLPeerCouldBeClosed_) { error = Error::Read; }
   }
   return ret;
 }
@@ -6739,9 +7574,9 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
     if (socket_.is_open()) {
       is_alive = detail::is_socket_alive(socket_.sock);
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
       if (is_alive && is_ssl()) {
-        if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) {
+        if (tls::is_peer_closed(socket_.ssl, socket_.sock)) {
           is_alive = false;
         }
       }
@@ -6765,7 +7600,7 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
         return false;
       }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
       // TODO: refactoring
       if (is_ssl()) {
         auto &scli = static_cast<SSLClient &>(*this);
@@ -6847,9 +7682,9 @@ Result ClientImpl::send_(Request &&req) {
   auto res = detail::make_unique<Response>();
   auto error = Error::Success;
   auto ret = send(req, *res, error);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
   return Result{ret ? std::move(res) : nullptr, error, std::move(req.headers),
-                last_ssl_error_, last_openssl_error_};
+                last_ssl_error_, last_backend_error_};
 #else
   return Result{ret ? std::move(res) : nullptr, error, std::move(req.headers)};
 #endif
@@ -6926,9 +7761,9 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
     auto is_alive = false;
     if (socket_.is_open()) {
       is_alive = detail::is_socket_alive(socket_.sock);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
       if (is_alive && is_ssl()) {
-        if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) {
+        if (tls::is_peer_closed(socket_.ssl, socket_.sock)) {
           is_alive = false;
         }
       }
@@ -6946,7 +7781,7 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
         return handle;
       }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
       if (is_ssl()) {
         auto &scli = static_cast<SSLClient &>(*this);
         if (!proxy_host_.empty() && proxy_port_ != -1) {
@@ -6962,11 +7797,12 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
     transfer_socket_ownership_to_handle(handle);
   }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  if (is_ssl() && handle.connection_->ssl) {
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  if (is_ssl() && handle.connection_->session) {
     handle.socket_stream_ = detail::make_unique<detail::SSLSocketStream>(
-        handle.connection_->sock, handle.connection_->ssl, read_timeout_sec_,
-        read_timeout_usec_, write_timeout_sec_, write_timeout_usec_);
+        handle.connection_->sock, handle.connection_->session,
+        read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
+        write_timeout_usec_);
   } else {
     handle.socket_stream_ = detail::make_unique<detail::SocketStream>(
         handle.connection_->sock, read_timeout_sec_, read_timeout_usec_,
@@ -7016,9 +7852,11 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
   }
 
   handle.body_reader_.stream = handle.stream_;
+  handle.body_reader_.payload_max_length = payload_max_length_;
 
   auto content_length_str = handle.response->get_header_value("Content-Length");
   if (!content_length_str.empty()) {
+    handle.body_reader_.has_content_length = true;
     handle.body_reader_.content_length =
         static_cast<size_t>(std::stoull(content_length_str));
   }
@@ -7066,6 +7904,7 @@ ssize_t ClientImpl::StreamHandle::read_with_decompression(char *buf,
     auto to_copy = (std::min)(len, available);
     std::memcpy(buf, decompress_buffer_.data() + decompress_offset_, to_copy);
     decompress_offset_ += to_copy;
+    decompressed_bytes_read_ += to_copy;
     return static_cast<ssize_t>(to_copy);
   }
 
@@ -7081,12 +7920,16 @@ ssize_t ClientImpl::StreamHandle::read_with_decompression(char *buf,
 
     if (n <= 0) { return n; }
 
-    bool decompress_ok =
-        decompressor_->decompress(compressed_buf, static_cast<size_t>(n),
-                                  [this](const char *data, size_t data_len) {
-                                    decompress_buffer_.append(data, data_len);
-                                    return true;
-                                  });
+    bool decompress_ok = decompressor_->decompress(
+        compressed_buf, static_cast<size_t>(n),
+        [this](const char *data, size_t data_len) {
+          decompress_buffer_.append(data, data_len);
+          auto limit = body_reader_.payload_max_length;
+          if (decompressed_bytes_read_ + decompress_buffer_.size() > limit) {
+            return false;
+          }
+          return true;
+        });
 
     if (!decompress_ok) {
       body_reader_.last_error = Error::Read;
@@ -7099,6 +7942,7 @@ ssize_t ClientImpl::StreamHandle::read_with_decompression(char *buf,
   auto to_copy = (std::min)(len, decompress_buffer_.size());
   std::memcpy(buf, decompress_buffer_.data(), to_copy);
   decompress_offset_ = to_copy;
+  decompressed_bytes_read_ += to_copy;
   return static_cast<ssize_t>(to_copy);
 }
 
@@ -7121,7 +7965,6 @@ void ClientImpl::StreamHandle::parse_trailers_if_needed() {
   }
 }
 
-// Inline method implementations for `ChunkedDecoder`.
 namespace detail {
 
 ChunkedDecoder::ChunkedDecoder(Stream &s) : strm(s) {}
@@ -7185,8 +8028,8 @@ bool ChunkedDecoder::parse_trailers_into(Headers &dest,
 void
 ClientImpl::transfer_socket_ownership_to_handle(StreamHandle &handle) {
   handle.connection_->sock = socket_.sock;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  handle.connection_->ssl = socket_.ssl;
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  handle.connection_->session = socket_.ssl;
   socket_.ssl = nullptr;
 #endif
   socket_.sock = INVALID_SOCKET;
@@ -7239,7 +8082,7 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
     ret = redirect(req, res, error);
   }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
   if ((res.status == StatusCode::Unauthorized_401 ||
        res.status == StatusCode::ProxyAuthenticationRequired_407) &&
       req.authorization_count_ < 5) {
@@ -7343,7 +8186,7 @@ bool ClientImpl::create_redirect_client(
 
   // Create appropriate client type and handle redirect
   if (need_ssl) {
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
     // Create SSL client for HTTPS redirect
     SSLClient redirect_client(host, port);
 
@@ -7363,9 +8206,10 @@ bool ClientImpl::create_redirect_client(
           server_hostname_verification_);
     }
 
-    // Handle CA certificate store and paths if available
-    if (ca_cert_store_ && X509_STORE_up_ref(ca_cert_store_)) {
-      redirect_client.set_ca_cert_store(ca_cert_store_);
+    // Transfer CA certificate to redirect client
+    if (!ca_cert_pem_.empty()) {
+      redirect_client.load_ca_cert_store(ca_cert_pem_.c_str(),
+                                         ca_cert_pem_.size());
     }
     if (!ca_cert_file_path_.empty()) {
       redirect_client.set_ca_cert_path(ca_cert_file_path_, ca_cert_dir_path_);
@@ -7418,7 +8262,7 @@ void ClientImpl::setup_redirect_client(ClientType &client) {
   if (!bearer_token_auth_token_.empty()) {
     client.set_bearer_token_auth(bearer_token_auth_token_);
   }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
   if (!digest_auth_username_.empty()) {
     client.set_digest_auth(digest_auth_username_, digest_auth_password_);
   }
@@ -7438,7 +8282,7 @@ void ClientImpl::setup_redirect_client(ClientType &client) {
     if (!proxy_bearer_token_auth_token_.empty()) {
       client.set_proxy_bearer_token_auth(proxy_bearer_token_auth_token_);
     }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
     if (!proxy_digest_auth_username_.empty()) {
       client.set_proxy_digest_auth(proxy_digest_auth_username_,
                                    proxy_digest_auth_password_);
@@ -7809,9 +8653,9 @@ Result ClientImpl::send_with_content_provider_and_receiver(
       std::move(content_provider_without_length), content_type,
       std::move(content_receiver), error);
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
   return Result{std::move(res), error, std::move(req.headers), last_ssl_error_,
-                last_openssl_error_};
+                last_backend_error_};
 #else
   return Result{std::move(res), error, std::move(req.headers)};
 #endif
@@ -7851,11 +8695,11 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
   auto write_request_success =
       write_request(strm, req, close_connection, error, expect_100_continue);
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  if (is_ssl()) {
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  if (is_ssl() && !expect_100_continue) {
     auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1;
     if (!is_proxy_enabled) {
-      if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) {
+      if (tls::is_peer_closed(socket_.ssl, socket_.sock)) {
         error = Error::SSLPeerCouldBeClosed_;
         output_error_log(error, &req);
         return false;
@@ -7937,6 +8781,11 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
                   [&](const char *buf, size_t n, size_t /*off*/,
                       size_t /*len*/) {
                     assert(res.body.size() + n <= res.body.max_size());
+                    if (payload_max_length_ > 0 &&
+                        (res.body.size() >= payload_max_length_ ||
+                         n > payload_max_length_ - res.body.size())) {
+                      return false;
+                    }
                     res.body.append(buf, n);
                     return true;
                   });
@@ -7965,9 +8814,12 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
 
     if (res.status != StatusCode::NotModified_304) {
       int dummy_status;
-      if (!detail::read_content(strm, res, (std::numeric_limits<size_t>::max)(),
-                                dummy_status, std::move(progress),
-                                std::move(out), decompress_)) {
+      auto max_length = (!has_payload_max_length_ && req.content_receiver)
+                            ? (std::numeric_limits<size_t>::max)()
+                            : payload_max_length_;
+      if (!detail::read_content(strm, res, max_length, dummy_status,
+                                std::move(progress), std::move(out),
+                                decompress_)) {
         if (error != Error::Canceled) { error = Error::Read; }
         output_error_log(error, &req);
         return false;
@@ -8878,14 +9730,6 @@ void ClientImpl::set_bearer_token_auth(const std::string &token) {
   bearer_token_auth_token_ = token;
 }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-void ClientImpl::set_digest_auth(const std::string &username,
-                                        const std::string &password) {
-  digest_auth_username_ = username;
-  digest_auth_password_ = password;
-}
-#endif
-
 void ClientImpl::set_keep_alive(bool on) { keep_alive_ = on; }
 
 void ClientImpl::set_follow_location(bool on) { follow_location_ = on; }
@@ -8922,6 +9766,11 @@ void ClientImpl::set_compress(bool on) { compress_ = on; }
 
 void ClientImpl::set_decompress(bool on) { decompress_ = on; }
 
+void ClientImpl::set_payload_max_length(size_t length) {
+  payload_max_length_ = length;
+  has_payload_max_length_ = true;
+}
+
 void ClientImpl::set_interface(const std::string &intf) {
   interface_ = intf;
 }
@@ -8941,11 +9790,11 @@ void ClientImpl::set_proxy_bearer_token_auth(const std::string &token) {
   proxy_bearer_token_auth_token_ = token;
 }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-void ClientImpl::set_proxy_digest_auth(const std::string &username,
-                                              const std::string &password) {
-  proxy_digest_auth_username_ = username;
-  proxy_digest_auth_password_ = password;
+#ifdef CPPHTTPLIB_SSL_ENABLED
+void ClientImpl::set_digest_auth(const std::string &username,
+                                        const std::string &password) {
+  digest_auth_username_ = username;
+  digest_auth_password_ = password;
 }
 
 void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path,
@@ -8954,12 +9803,23 @@ void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path,
   ca_cert_dir_path_ = ca_cert_dir_path;
 }
 
-void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store && ca_cert_store != ca_cert_store_) {
-    ca_cert_store_ = ca_cert_store;
-  }
+void ClientImpl::set_proxy_digest_auth(const std::string &username,
+                                              const std::string &password) {
+  proxy_digest_auth_username_ = username;
+  proxy_digest_auth_password_ = password;
 }
 
+void ClientImpl::enable_server_certificate_verification(bool enabled) {
+  server_certificate_verification_ = enabled;
+}
+
+void ClientImpl::enable_server_hostname_verification(bool enabled) {
+  server_hostname_verification_ = enabled;
+}
+#endif
+
+// ClientImpl::set_ca_cert_store is defined after TLS namespace (uses helpers)
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert,
                                                     std::size_t size) const {
   auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size));
@@ -8984,17 +9844,9 @@ X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert,
   return cts;
 }
 
-void ClientImpl::enable_server_certificate_verification(bool enabled) {
-  server_certificate_verification_ = enabled;
-}
-
-void ClientImpl::enable_server_hostname_verification(bool enabled) {
-  server_hostname_verification_ = enabled;
-}
-
 void ClientImpl::set_server_certificate_verifier(
-    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
-  server_certificate_verifier_ = verifier;
+    std::function<SSLVerifierResponse(SSL *ssl)> /*verifier*/) {
+  // Base implementation does nothing - SSLClient overrides this
 }
 #endif
 
@@ -9007,958 +9859,24 @@ void ClientImpl::set_error_logger(ErrorLogger error_logger) {
 }
 
 /*
- * SSL Implementation
+ * SSL/TLS Common Implementation
  */
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-namespace detail {
 
-bool is_ip_address(const std::string &host) {
-  struct in_addr addr4;
-  struct in6_addr addr6;
-  return inet_pton(AF_INET, host.c_str(), &addr4) == 1 ||
-         inet_pton(AF_INET6, host.c_str(), &addr6) == 1;
-}
-
-template <typename U, typename V>
-SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex,
-                    U SSL_connect_or_accept, V setup) {
-  SSL *ssl = nullptr;
-  {
-    std::lock_guard<std::mutex> guard(ctx_mutex);
-    ssl = SSL_new(ctx);
-  }
-
-  if (ssl) {
-    set_nonblocking(sock, true);
-    auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
-    BIO_set_nbio(bio, 1);
-    SSL_set_bio(ssl, bio, bio);
-
-    if (!setup(ssl) || SSL_connect_or_accept(ssl) != 1) {
-      SSL_shutdown(ssl);
-      {
-        std::lock_guard<std::mutex> guard(ctx_mutex);
-        SSL_free(ssl);
-      }
-      set_nonblocking(sock, false);
-      return nullptr;
-    }
-    BIO_set_nbio(bio, 0);
-    set_nonblocking(sock, false);
-  }
-
-  return ssl;
-}
-
-void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock,
-                       bool shutdown_gracefully) {
-  // sometimes we may want to skip this to try to avoid SIGPIPE if we know
-  // the remote has closed the network connection
-  // Note that it is not always possible to avoid SIGPIPE, this is merely a
-  // best-efforts.
-  if (shutdown_gracefully) {
-    (void)(sock);
-    // SSL_shutdown() returns 0 on first call (indicating close_notify alert
-    // sent) and 1 on subsequent call (indicating close_notify alert received)
-    if (SSL_shutdown(ssl) == 0) {
-      // Expected to return 1, but even if it doesn't, we free ssl
-      SSL_shutdown(ssl);
-    }
-  }
-
-  std::lock_guard<std::mutex> guard(ctx_mutex);
-  SSL_free(ssl);
-}
-
-template <typename U>
-bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl,
-                                       U ssl_connect_or_accept,
-                                       time_t timeout_sec, time_t timeout_usec,
-                                       int *ssl_error) {
-  auto res = 0;
-  while ((res = ssl_connect_or_accept(ssl)) != 1) {
-    auto err = SSL_get_error(ssl, res);
-    switch (err) {
-    case SSL_ERROR_WANT_READ:
-      if (select_read(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    case SSL_ERROR_WANT_WRITE:
-      if (select_write(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    default: break;
-    }
-    if (ssl_error) { *ssl_error = err; }
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-bool process_server_socket_ssl(
-    const std::atomic<socket_t> &svr_sock, SSL *ssl, socket_t sock,
-    size_t keep_alive_max_count, time_t keep_alive_timeout_sec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                             write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-template <typename T>
-bool process_client_socket_ssl(
-    SSL *ssl, socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time, T callback) {
-  SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                       write_timeout_sec, write_timeout_usec, max_timeout_msec,
-                       start_time);
-  return callback(strm);
-}
-
-// SSL socket stream implementation
-SSLSocketStream::SSLSocketStream(
-    socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time)
-    : sock_(sock), ssl_(ssl), read_timeout_sec_(read_timeout_sec),
-      read_timeout_usec_(read_timeout_usec),
-      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec),
-      max_timeout_msec_(max_timeout_msec), start_time_(start_time) {
-  SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
-}
-
-SSLSocketStream::~SSLSocketStream() = default;
-
-bool SSLSocketStream::is_readable() const {
-  return SSL_pending(ssl_) > 0;
-}
-
-bool SSLSocketStream::wait_readable() const {
-  if (max_timeout_msec_ <= 0) {
-    return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
-  }
-
-  time_t read_timeout_sec;
-  time_t read_timeout_usec;
-  calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_,
-                      read_timeout_usec_, read_timeout_sec, read_timeout_usec);
-
-  return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0;
-}
-
-bool SSLSocketStream::wait_writable() const {
-  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_) && !is_ssl_peer_could_be_closed(ssl_, sock_);
-}
-
-ssize_t SSLSocketStream::read(char *ptr, size_t size) {
-  if (SSL_pending(ssl_) > 0) {
-    auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-    if (ret == 0) { error_ = Error::ConnectionClosed; }
-    return ret;
-  } else if (wait_readable()) {
-    auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_READ ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_READ) {
-#endif
-        if (SSL_pending(ssl_) > 0) {
-          return SSL_read(ssl_, ptr, static_cast<int>(size));
-        } else if (wait_readable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          break;
-        }
-      }
-      assert(ret < 0);
-    } else if (ret == 0) {
-      error_ = Error::ConnectionClosed;
-    }
-    return ret;
-  } else {
-    error_ = Error::Timeout;
-    return -1;
-  }
-}
-
-ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
-  if (wait_writable()) {
-    auto handle_size = static_cast<int>(
-        std::min<size_t>(size, (std::numeric_limits<int>::max)()));
-
-    auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) {
-#endif
-        if (wait_writable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          break;
-        }
-      }
-      assert(ret < 0);
-    }
-    return ret;
-  }
-  return -1;
-}
-
-void SSLSocketStream::get_remote_ip_and_port(std::string &ip,
-                                                    int &port) const {
-  detail::get_remote_ip_and_port(sock_, ip, port);
-}
-
-void SSLSocketStream::get_local_ip_and_port(std::string &ip,
-                                                   int &port) const {
-  detail::get_local_ip_and_port(sock_, ip, port);
-}
-
-socket_t SSLSocketStream::socket() const { return sock_; }
-
-time_t SSLSocketStream::duration() const {
-  return std::chrono::duration_cast<std::chrono::milliseconds>(
-             std::chrono::steady_clock::now() - start_time_)
-      .count();
-}
-
-} // namespace detail
-
-// SSL HTTP server implementation
-SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
-                            const char *client_ca_cert_file_path,
-                            const char *client_ca_cert_dir_path,
-                            const char *private_key_password) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (private_key_password != nullptr && (private_key_password[0] != '\0')) {
-      SSL_CTX_set_default_passwd_cb_userdata(
-          ctx_,
-          reinterpret_cast<void *>(const_cast<char *>(private_key_password)));
-    }
-
-    if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) !=
-            1 ||
-        SSL_CTX_check_private_key(ctx_) != 1) {
-      last_ssl_error_ = static_cast<int>(ERR_get_error());
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_file_path || client_ca_cert_dir_path) {
-      SSL_CTX_load_verify_locations(ctx_, client_ca_cert_file_path,
-                                    client_ca_cert_dir_path);
-
-      // Set client CA list to be sent to clients during TLS handshake
-      if (client_ca_cert_file_path) {
-        auto ca_list = SSL_load_client_CA_file(client_ca_cert_file_path);
-        if (ca_list != nullptr) {
-          SSL_CTX_set_client_CA_list(ctx_, ca_list);
-        } else {
-          // Failed to load client CA list, but we continue since
-          // SSL_CTX_load_verify_locations already succeeded and
-          // certificate verification will still work
-          last_ssl_error_ = static_cast<int>(ERR_get_error());
-        }
-      }
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
-                            X509_STORE *client_ca_cert_store) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (SSL_CTX_use_certificate(ctx_, cert) != 1 ||
-        SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_store) {
-      SSL_CTX_set_cert_store(ctx_, client_ca_cert_store);
-
-      // Extract CA names from the store and set them as the client CA list
-      auto ca_list = extract_ca_names_from_x509_store(client_ca_cert_store);
-      if (ca_list) {
-        SSL_CTX_set_client_CA_list(ctx_, ca_list);
-      } else {
-        // Failed to extract CA names, record the error
-        last_ssl_error_ = static_cast<int>(ERR_get_error());
-      }
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-SSLServer::SSLServer(
-    const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback) {
-  ctx_ = SSL_CTX_new(TLS_method());
-  if (ctx_) {
-    if (!setup_ssl_ctx_callback(*ctx_)) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-SSLServer::~SSLServer() {
-  if (ctx_) { SSL_CTX_free(ctx_); }
-}
-
-bool SSLServer::is_valid() const { return ctx_; }
-
-SSL_CTX *SSLServer::ssl_context() const { return ctx_; }
-
-void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key,
-                                    X509_STORE *client_ca_cert_store) {
-
-  std::lock_guard<std::mutex> guard(ctx_mutex_);
-
-  SSL_CTX_use_certificate(ctx_, cert);
-  SSL_CTX_use_PrivateKey(ctx_, private_key);
-
-  if (client_ca_cert_store != nullptr) {
-    SSL_CTX_set_cert_store(ctx_, client_ca_cert_store);
-  }
-}
-
-bool SSLServer::process_and_close_socket(socket_t sock) {
-  auto ssl = detail::ssl_new(
-      sock, ctx_, ctx_mutex_,
-      [&](SSL *ssl2) {
-        return detail::ssl_connect_or_accept_nonblocking(
-            sock, ssl2, SSL_accept, read_timeout_sec_, read_timeout_usec_,
-            &last_ssl_error_);
-      },
-      [](SSL * /*ssl2*/) { return true; });
-
-  auto ret = false;
-  if (ssl) {
-    std::string remote_addr;
-    int remote_port = 0;
-    detail::get_remote_ip_and_port(sock, remote_addr, remote_port);
-
-    std::string local_addr;
-    int local_port = 0;
-    detail::get_local_ip_and_port(sock, local_addr, local_port);
-
-    ret = detail::process_server_socket_ssl(
-        svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
-        read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-        write_timeout_usec_,
-        [&](Stream &strm, bool close_connection, bool &connection_closed) {
-          return process_request(strm, remote_addr, remote_port, local_addr,
-                                 local_port, close_connection,
-                                 connection_closed,
-                                 [&](Request &req) { req.ssl = ssl; });
-        });
-
-    // Shutdown gracefully if the result seemed successful, non-gracefully if
-    // the connection appeared to be closed.
-    const bool shutdown_gracefully = ret;
-    detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully);
-  }
-
-  detail::shutdown_socket(sock);
-  detail::close_socket(sock);
-  return ret;
-}
-
-STACK_OF(X509_NAME) * SSLServer::extract_ca_names_from_x509_store(
-                                 X509_STORE *store) {
-  if (!store) { return nullptr; }
-
-  auto ca_list = sk_X509_NAME_new_null();
-  if (!ca_list) { return nullptr; }
-
-  // Get all objects from the store
-  auto objs = X509_STORE_get0_objects(store);
-  if (!objs) {
-    sk_X509_NAME_free(ca_list);
-    return nullptr;
-  }
-
-  // Iterate through objects and extract certificate subject names
-  for (int i = 0; i < sk_X509_OBJECT_num(objs); i++) {
-    auto obj = sk_X509_OBJECT_value(objs, i);
-    if (X509_OBJECT_get_type(obj) == X509_LU_X509) {
-      auto cert = X509_OBJECT_get0_X509(obj);
-      if (cert) {
-        auto subject = X509_get_subject_name(cert);
-        if (subject) {
-          auto name_dup = X509_NAME_dup(subject);
-          if (name_dup) { sk_X509_NAME_push(ca_list, name_dup); }
-        }
-      }
-    }
-  }
-
-  // If no names were extracted, free the list and return nullptr
-  if (sk_X509_NAME_num(ca_list) == 0) {
-    sk_X509_NAME_free(ca_list);
-    return nullptr;
-  }
-
-  return ca_list;
-}
-
-// SSL HTTP client implementation
-SSLClient::SSLClient(const std::string &host)
-    : SSLClient(host, 443, std::string(), std::string()) {}
-
-SSLClient::SSLClient(const std::string &host, int port)
-    : SSLClient(host, port, std::string(), std::string()) {}
-
-SSLClient::SSLClient(const std::string &host, int port,
-                            const std::string &client_cert_path,
-                            const std::string &client_key_path,
-                            const std::string &private_key_password)
-    : ClientImpl(host, port, client_cert_path, client_key_path) {
-  ctx_ = SSL_CTX_new(TLS_client_method());
-
-  SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-  detail::split(&host_[0], &host_[host_.size()], '.',
-                [&](const char *b, const char *e) {
-                  host_components_.emplace_back(b, e);
-                });
-
-  if (!client_cert_path.empty() && !client_key_path.empty()) {
-    if (!private_key_password.empty()) {
-      SSL_CTX_set_default_passwd_cb_userdata(
-          ctx_, reinterpret_cast<void *>(
-                    const_cast<char *>(private_key_password.c_str())));
-    }
-
-    if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(),
-                                     SSL_FILETYPE_PEM) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(),
-                                    SSL_FILETYPE_PEM) != 1) {
-      last_openssl_error_ = ERR_get_error();
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-SSLClient::SSLClient(const std::string &host, int port,
-                            X509 *client_cert, EVP_PKEY *client_key,
-                            const std::string &private_key_password)
-    : ClientImpl(host, port) {
-  ctx_ = SSL_CTX_new(TLS_client_method());
-
-  detail::split(&host_[0], &host_[host_.size()], '.',
-                [&](const char *b, const char *e) {
-                  host_components_.emplace_back(b, e);
-                });
-
-  if (client_cert != nullptr && client_key != nullptr) {
-    if (!private_key_password.empty()) {
-      SSL_CTX_set_default_passwd_cb_userdata(
-          ctx_, reinterpret_cast<void *>(
-                    const_cast<char *>(private_key_password.c_str())));
-    }
-
-    if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 ||
-        SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) {
-      last_openssl_error_ = ERR_get_error();
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-SSLClient::~SSLClient() {
-  if (ctx_) { SSL_CTX_free(ctx_); }
-  // Make sure to shut down SSL since shutdown_ssl will resolve to the
-  // base function rather than the derived function once we get to the
-  // base class destructor, and won't free the SSL (causing a leak).
-  shutdown_ssl_impl(socket_, true);
-}
-
-bool SSLClient::is_valid() const { return ctx_; }
-
-void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store) {
-    if (ctx_) {
-      if (SSL_CTX_get_cert_store(ctx_) != ca_cert_store) {
-        // Free memory allocated for old cert and use new store
-        // `ca_cert_store`
-        SSL_CTX_set_cert_store(ctx_, ca_cert_store);
-        ca_cert_store_ = ca_cert_store;
-      }
-    } else {
-      X509_STORE_free(ca_cert_store);
-    }
-  }
-}
-
-void SSLClient::load_ca_cert_store(const char *ca_cert,
-                                          std::size_t size) {
-  set_ca_cert_store(ClientImpl::create_ca_cert_store(ca_cert, size));
-}
-
-long SSLClient::get_openssl_verify_result() const {
-  return verify_result_;
-}
-
-SSL_CTX *SSLClient::ssl_context() const { return ctx_; }
-
-bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
-  if (!is_valid()) {
-    error = Error::SSLConnection;
-    return false;
-  }
-  return ClientImpl::create_and_connect_socket(socket, error);
-}
-
-// Assumes that socket_mutex_ is locked and that there are no requests in
-// flight
-bool SSLClient::connect_with_proxy(
-    Socket &socket,
-    std::chrono::time_point<std::chrono::steady_clock> start_time,
-    Response &res, bool &success, Error &error) {
-  success = true;
-  Response proxy_res;
-  if (!detail::process_client_socket(
-          socket.sock, read_timeout_sec_, read_timeout_usec_,
-          write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
-          start_time, [&](Stream &strm) {
-            Request req2;
-            req2.method = "CONNECT";
-            req2.path =
-                detail::make_host_and_port_string_always_port(host_, port_);
-            if (max_timeout_msec_ > 0) {
-              req2.start_time_ = std::chrono::steady_clock::now();
-            }
-            return process_request(strm, req2, proxy_res, false, error);
-          })) {
-    // Thread-safe to close everything because we are assuming there are no
-    // requests in flight
-    shutdown_ssl(socket, true);
-    shutdown_socket(socket);
-    close_socket(socket);
-    success = false;
-    return false;
-  }
-
-  if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) {
-    if (!proxy_digest_auth_username_.empty() &&
-        !proxy_digest_auth_password_.empty()) {
-      std::map<std::string, std::string> auth;
-      if (detail::parse_www_authenticate(proxy_res, auth, true)) {
-        // Close the current socket and create a new one for the authenticated
-        // request
-        shutdown_ssl(socket, true);
-        shutdown_socket(socket);
-        close_socket(socket);
-
-        // Create a new socket for the authenticated CONNECT request
-        if (!ensure_socket_connection(socket, error)) {
-          success = false;
-          output_error_log(error, nullptr);
-          return false;
-        }
-
-        proxy_res = Response();
-        if (!detail::process_client_socket(
-                socket.sock, read_timeout_sec_, read_timeout_usec_,
-                write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
-                start_time, [&](Stream &strm) {
-                  Request req3;
-                  req3.method = "CONNECT";
-                  req3.path = detail::make_host_and_port_string_always_port(
-                      host_, port_);
-                  req3.headers.insert(detail::make_digest_authentication_header(
-                      req3, auth, 1, detail::random_string(10),
-                      proxy_digest_auth_username_, proxy_digest_auth_password_,
-                      true));
-                  if (max_timeout_msec_ > 0) {
-                    req3.start_time_ = std::chrono::steady_clock::now();
-                  }
-                  return process_request(strm, req3, proxy_res, false, error);
-                })) {
-          // Thread-safe to close everything because we are assuming there are
-          // no requests in flight
-          shutdown_ssl(socket, true);
-          shutdown_socket(socket);
-          close_socket(socket);
-          success = false;
-          return false;
-        }
-      }
-    }
-  }
-
-  // If status code is not 200, proxy request is failed.
-  // Set error to ProxyConnection and return proxy response
-  // as the response of the request
-  if (proxy_res.status != StatusCode::OK_200) {
-    error = Error::ProxyConnection;
-    output_error_log(error, nullptr);
-    res = std::move(proxy_res);
-    // Thread-safe to close everything because we are assuming there are
-    // no requests in flight
-    shutdown_ssl(socket, true);
-    shutdown_socket(socket);
-    close_socket(socket);
-    return false;
-  }
-
-  return true;
-}
-
-bool SSLClient::load_certs() {
-  auto ret = true;
-
-  std::call_once(initialize_cert_, [&]() {
-    std::lock_guard<std::mutex> guard(ctx_mutex_);
-    if (!ca_cert_file_path_.empty()) {
-      if (!SSL_CTX_load_verify_locations(ctx_, ca_cert_file_path_.c_str(),
-                                         nullptr)) {
-        last_openssl_error_ = ERR_get_error();
-        ret = false;
-      }
-    } else if (!ca_cert_dir_path_.empty()) {
-      if (!SSL_CTX_load_verify_locations(ctx_, nullptr,
-                                         ca_cert_dir_path_.c_str())) {
-        last_openssl_error_ = ERR_get_error();
-        ret = false;
-      }
-    } else if (!ca_cert_store_) {
-      auto loaded = false;
-#ifdef _WIN32
-      loaded =
-          detail::load_system_certs_on_windows(SSL_CTX_get_cert_store(ctx_));
-#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && TARGET_OS_MAC
-      loaded = detail::load_system_certs_on_macos(SSL_CTX_get_cert_store(ctx_));
-#endif // _WIN32
-      if (!loaded) { SSL_CTX_set_default_verify_paths(ctx_); }
-    }
-  });
-
-  return ret;
-}
-
-bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
-  auto ssl = detail::ssl_new(
-      socket.sock, ctx_, ctx_mutex_,
-      [&](SSL *ssl2) {
-        if (server_certificate_verification_) {
-          if (!load_certs()) {
-            error = Error::SSLLoadingCerts;
-            output_error_log(error, nullptr);
-            return false;
-          }
-          SSL_set_verify(ssl2, SSL_VERIFY_NONE, nullptr);
-        }
-
-        if (!detail::ssl_connect_or_accept_nonblocking(
-                socket.sock, ssl2, SSL_connect, connection_timeout_sec_,
-                connection_timeout_usec_, &last_ssl_error_)) {
-          error = Error::SSLConnection;
-          output_error_log(error, nullptr);
-          return false;
-        }
-
-        if (server_certificate_verification_) {
-          auto verification_status = SSLVerifierResponse::NoDecisionMade;
-
-          if (server_certificate_verifier_) {
-            verification_status = server_certificate_verifier_(ssl2);
-          }
-
-          if (verification_status == SSLVerifierResponse::CertificateRejected) {
-            last_openssl_error_ = ERR_get_error();
-            error = Error::SSLServerVerification;
-            output_error_log(error, nullptr);
-            return false;
-          }
-
-          if (verification_status == SSLVerifierResponse::NoDecisionMade) {
-            verify_result_ = SSL_get_verify_result(ssl2);
-
-            if (verify_result_ != X509_V_OK) {
-              last_openssl_error_ = static_cast<unsigned long>(verify_result_);
-              error = Error::SSLServerVerification;
-              output_error_log(error, nullptr);
-              return false;
-            }
-
-            auto server_cert = SSL_get1_peer_certificate(ssl2);
-            auto se = detail::scope_exit([&] { X509_free(server_cert); });
-
-            if (server_cert == nullptr) {
-              last_openssl_error_ = ERR_get_error();
-              error = Error::SSLServerVerification;
-              output_error_log(error, nullptr);
-              return false;
-            }
-
-            if (server_hostname_verification_) {
-              if (!verify_host(server_cert)) {
-                last_openssl_error_ = X509_V_ERR_HOSTNAME_MISMATCH;
-                error = Error::SSLServerHostnameVerification;
-                output_error_log(error, nullptr);
-                return false;
-              }
-            }
-          }
-        }
-
-        return true;
-      },
-      [&](SSL *ssl2) {
-        // Set SNI only if host is not IP address
-        if (!detail::is_ip_address(host_)) {
-#if defined(OPENSSL_IS_BORINGSSL)
-          SSL_set_tlsext_host_name(ssl2, host_.c_str());
-#else
-          // NOTE: Direct call instead of using the OpenSSL macro to suppress
-          // -Wold-style-cast warning
-          SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME,
-                   TLSEXT_NAMETYPE_host_name,
-                   static_cast<void *>(const_cast<char *>(host_.c_str())));
-#endif
-        }
-        return true;
-      });
-
-  if (ssl) {
-    socket.ssl = ssl;
-    return true;
-  }
-
-  if (ctx_ == nullptr) {
-    error = Error::SSLConnection;
-    last_openssl_error_ = ERR_get_error();
-  }
-
-  shutdown_socket(socket);
-  close_socket(socket);
-  return false;
-}
-
-void SSLClient::shutdown_ssl(Socket &socket, bool shutdown_gracefully) {
-  shutdown_ssl_impl(socket, shutdown_gracefully);
-}
-
-void SSLClient::shutdown_ssl_impl(Socket &socket,
-                                         bool shutdown_gracefully) {
-  if (socket.sock == INVALID_SOCKET) {
-    assert(socket.ssl == nullptr);
-    return;
-  }
-  if (socket.ssl) {
-    detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock,
-                       shutdown_gracefully);
-    socket.ssl = nullptr;
-  }
-  assert(socket.ssl == nullptr);
-}
-
-bool SSLClient::process_socket(
-    const Socket &socket,
-    std::chrono::time_point<std::chrono::steady_clock> start_time,
-    std::function<bool(Stream &strm)> callback) {
-  assert(socket.ssl);
-  return detail::process_client_socket_ssl(
-      socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_,
-      write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, start_time,
-      std::move(callback));
-}
-
-bool SSLClient::is_ssl() const { return true; }
-
-bool SSLClient::verify_host(X509 *server_cert) const {
-  /* Quote from RFC2818 section 3.1 "Server Identity"
-
-     If a subjectAltName extension of type dNSName is present, that MUST
-     be used as the identity. Otherwise, the (most specific) Common Name
-     field in the Subject field of the certificate MUST be used. Although
-     the use of the Common Name is existing practice, it is deprecated and
-     Certification Authorities are encouraged to use the dNSName instead.
-
-     Matching is performed using the matching rules specified by
-     [RFC2459].  If more than one identity of a given type is present in
-     the certificate (e.g., more than one dNSName name, a match in any one
-     of the set is considered acceptable.) Names may contain the wildcard
-     character * which is considered to match any single domain name
-     component or component fragment. E.g., *.a.com matches foo.a.com but
-     not bar.foo.a.com. f*.com matches foo.com but not bar.com.
-
-     In some cases, the URI is specified as an IP address rather than a
-     hostname. In this case, the iPAddress subjectAltName must be present
-     in the certificate and must exactly match the IP in the URI.
-
-  */
-  return verify_host_with_subject_alt_name(server_cert) ||
-         verify_host_with_common_name(server_cert);
-}
-
-bool
-SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
-  auto ret = false;
-
-  auto type = GEN_DNS;
-
-  struct in6_addr addr6 = {};
-  struct in_addr addr = {};
-  size_t addr_len = 0;
-
-#ifndef __MINGW32__
-  if (inet_pton(AF_INET6, host_.c_str(), &addr6)) {
-    type = GEN_IPADD;
-    addr_len = sizeof(struct in6_addr);
-  } else if (inet_pton(AF_INET, host_.c_str(), &addr)) {
-    type = GEN_IPADD;
-    addr_len = sizeof(struct in_addr);
+ClientConnection::~ClientConnection() {
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  if (session) {
+    tls::shutdown(session, true);
+    tls::free_session(session);
+    session = nullptr;
   }
 #endif
 
-  auto alt_names = static_cast<const struct stack_st_GENERAL_NAME *>(
-      X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr));
-
-  if (alt_names) {
-    auto dsn_matched = false;
-    auto ip_matched = false;
-
-    auto count = sk_GENERAL_NAME_num(alt_names);
-
-    for (decltype(count) i = 0; i < count && !dsn_matched; i++) {
-      auto val = sk_GENERAL_NAME_value(alt_names, i);
-      if (!val || val->type != type) { continue; }
-
-      auto name =
-          reinterpret_cast<const char *>(ASN1_STRING_get0_data(val->d.ia5));
-      if (name == nullptr) { continue; }
-
-      auto name_len = static_cast<size_t>(ASN1_STRING_length(val->d.ia5));
-
-      switch (type) {
-      case GEN_DNS: dsn_matched = check_host_name(name, name_len); break;
-
-      case GEN_IPADD:
-        if (!memcmp(&addr6, name, addr_len) || !memcmp(&addr, name, addr_len)) {
-          ip_matched = true;
-        }
-        break;
-      }
-    }
-
-    if (dsn_matched || ip_matched) { ret = true; }
+  if (sock != INVALID_SOCKET) {
+    detail::close_socket(sock);
+    sock = INVALID_SOCKET;
   }
-
-  GENERAL_NAMES_free(const_cast<STACK_OF(GENERAL_NAME) *>(
-      reinterpret_cast<const STACK_OF(GENERAL_NAME) *>(alt_names)));
-  return ret;
 }
 
-bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
-  const auto subject_name = X509_get_subject_name(server_cert);
-
-  if (subject_name != nullptr) {
-    char name[BUFSIZ];
-    auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName,
-                                              name, sizeof(name));
-
-    if (name_len != -1) {
-      return check_host_name(name, static_cast<size_t>(name_len));
-    }
-  }
-
-  return false;
-}
-
-bool SSLClient::check_host_name(const char *pattern,
-                                       size_t pattern_len) const {
-  // Exact match (case-insensitive)
-  if (host_.size() == pattern_len &&
-      detail::case_ignore::equal(host_, std::string(pattern, pattern_len))) {
-    return true;
-  }
-
-  // Wildcard match
-  // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
-  std::vector<std::string> pattern_components;
-  detail::split(&pattern[0], &pattern[pattern_len], '.',
-                [&](const char *b, const char *e) {
-                  pattern_components.emplace_back(b, e);
-                });
-
-  if (host_components_.size() != pattern_components.size()) { return false; }
-
-  auto itr = pattern_components.begin();
-  for (const auto &h : host_components_) {
-    auto &p = *itr;
-    if (!httplib::detail::case_ignore::equal(p, h) && p != "*") {
-      bool partial_match = false;
-      if (!p.empty() && p[p.size() - 1] == '*') {
-        const auto prefix_length = p.size() - 1;
-        if (prefix_length == 0) {
-          partial_match = true;
-        } else if (h.size() >= prefix_length) {
-          partial_match =
-              std::equal(p.begin(),
-                         p.begin() + static_cast<std::string::difference_type>(
-                                         prefix_length),
-                         h.begin(), [](const char ca, const char cb) {
-                           return httplib::detail::case_ignore::to_lower(ca) ==
-                                  httplib::detail::case_ignore::to_lower(cb);
-                         });
-        }
-      }
-      if (!partial_match) { return false; }
-    }
-    ++itr;
-  }
-
-  return true;
-}
-#endif
-
 // Universal client implementation
 Client::Client(const std::string &scheme_host_port)
     : Client(scheme_host_port, std::string(), std::string()) {}
@@ -9973,7 +9891,7 @@ Client::Client(const std::string &scheme_host_port,
   if (std::regex_match(scheme_host_port, m, re)) {
     auto scheme = m[1].str();
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
     if (!scheme.empty() && (scheme != "http" && scheme != "https")) {
 #else
     if (!scheme.empty() && scheme != "http") {
@@ -9994,7 +9912,7 @@ Client::Client(const std::string &scheme_host_port,
     auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
 
     if (is_ssl) {
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
       cli_ = detail::make_unique<SSLClient>(host, port, client_cert_path,
                                             client_key_path);
       is_ssl_ = is_ssl;
@@ -10579,12 +10497,6 @@ void Client::set_basic_auth(const std::string &username,
 void Client::set_bearer_token_auth(const std::string &token) {
   cli_->set_bearer_token_auth(token);
 }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-void Client::set_digest_auth(const std::string &username,
-                                    const std::string &password) {
-  cli_->set_digest_auth(username, password);
-}
-#endif
 
 void Client::set_keep_alive(bool on) { cli_->set_keep_alive(on); }
 void Client::set_follow_location(bool on) {
@@ -10602,6 +10514,10 @@ void Client::set_compress(bool on) { cli_->set_compress(on); }
 
 void Client::set_decompress(bool on) { cli_->set_decompress(on); }
 
+void Client::set_payload_max_length(size_t length) {
+  cli_->set_payload_max_length(length);
+}
+
 void Client::set_interface(const std::string &intf) {
   cli_->set_interface(intf);
 }
@@ -10616,27 +10532,6 @@ void Client::set_proxy_basic_auth(const std::string &username,
 void Client::set_proxy_bearer_token_auth(const std::string &token) {
   cli_->set_proxy_bearer_token_auth(token);
 }
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-void Client::set_proxy_digest_auth(const std::string &username,
-                                          const std::string &password) {
-  cli_->set_proxy_digest_auth(username, password);
-}
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-void Client::enable_server_certificate_verification(bool enabled) {
-  cli_->enable_server_certificate_verification(enabled);
-}
-
-void Client::enable_server_hostname_verification(bool enabled) {
-  cli_->enable_server_hostname_verification(enabled);
-}
-
-void Client::set_server_certificate_verifier(
-    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
-  cli_->set_server_certificate_verifier(verifier);
-}
-#endif
 
 void Client::set_logger(Logger logger) {
   cli_->set_logger(std::move(logger));
@@ -10646,35 +10541,3399 @@ void Client::set_error_logger(ErrorLogger error_logger) {
   cli_->set_error_logger(std::move(error_logger));
 }
 
+/*
+ * Group 6: SSL Server and Client implementation
+ */
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+
+// SSL HTTP server implementation
+SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
+                            const char *client_ca_cert_file_path,
+                            const char *client_ca_cert_dir_path,
+                            const char *private_key_password) {
+  using namespace tls;
+
+  ctx_ = create_server_context();
+  if (!ctx_) { return; }
+
+  // Load server certificate and private key
+  if (!set_server_cert_file(ctx_, cert_path, private_key_path,
+                            private_key_password)) {
+    last_ssl_error_ = static_cast<int>(get_error());
+    free_context(ctx_);
+    ctx_ = nullptr;
+    return;
+  }
+
+  // Load client CA certificates for client authentication
+  if (client_ca_cert_file_path || client_ca_cert_dir_path) {
+    if (!set_client_ca_file(ctx_, client_ca_cert_file_path,
+                            client_ca_cert_dir_path)) {
+      last_ssl_error_ = static_cast<int>(get_error());
+      free_context(ctx_);
+      ctx_ = nullptr;
+      return;
+    }
+    // Enable client certificate verification
+    set_verify_client(ctx_, true);
+  }
+}
+
+SSLServer::SSLServer(const PemMemory &pem) {
+  using namespace tls;
+  ctx_ = create_server_context();
+  if (ctx_) {
+    if (!set_server_cert_pem(ctx_, pem.cert_pem, pem.key_pem,
+                             pem.private_key_password)) {
+      last_ssl_error_ = static_cast<int>(get_error());
+      free_context(ctx_);
+      ctx_ = nullptr;
+    } else if (pem.client_ca_pem && pem.client_ca_pem_len > 0) {
+      if (!load_ca_pem(ctx_, pem.client_ca_pem, pem.client_ca_pem_len)) {
+        last_ssl_error_ = static_cast<int>(get_error());
+        free_context(ctx_);
+        ctx_ = nullptr;
+      } else {
+        set_verify_client(ctx_, true);
+      }
+    }
+  }
+}
+
+SSLServer::SSLServer(const tls::ContextSetupCallback &setup_callback) {
+  using namespace tls;
+  ctx_ = create_server_context();
+  if (ctx_) {
+    if (!setup_callback(ctx_)) {
+      free_context(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+}
+
+SSLServer::~SSLServer() {
+  if (ctx_) { tls::free_context(ctx_); }
+}
+
+bool SSLServer::is_valid() const { return ctx_ != nullptr; }
+
+bool SSLServer::process_and_close_socket(socket_t sock) {
+  using namespace tls;
+
+  // Create TLS session with mutex protection
+  session_t session = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(ctx_mutex_);
+    session = create_session(static_cast<ctx_t>(ctx_), sock);
+  }
+
+  if (!session) {
+    last_ssl_error_ = static_cast<int>(get_error());
+    detail::shutdown_socket(sock);
+    detail::close_socket(sock);
+    return false;
+  }
+
+  // Use scope_exit to ensure cleanup on all paths (including exceptions)
+  bool handshake_done = false;
+  bool ret = false;
+  auto cleanup = detail::scope_exit([&] {
+    // Shutdown gracefully if handshake succeeded and processing was successful
+    if (handshake_done) { shutdown(session, ret); }
+    free_session(session);
+    detail::shutdown_socket(sock);
+    detail::close_socket(sock);
+  });
+
+  // Perform TLS accept handshake with timeout
+  TlsError tls_err;
+  if (!accept_nonblocking(session, sock, read_timeout_sec_, read_timeout_usec_,
+                          &tls_err)) {
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    // Map TlsError to legacy ssl_error for backward compatibility
+    if (tls_err.code == ErrorCode::WantRead) {
+      last_ssl_error_ = SSL_ERROR_WANT_READ;
+    } else if (tls_err.code == ErrorCode::WantWrite) {
+      last_ssl_error_ = SSL_ERROR_WANT_WRITE;
+    } else {
+      last_ssl_error_ = SSL_ERROR_SSL;
+    }
+#else
+    last_ssl_error_ = static_cast<int>(get_error());
+#endif
+    return false;
+  }
+
+  handshake_done = true;
+
+  std::string remote_addr;
+  int remote_port = 0;
+  detail::get_remote_ip_and_port(sock, remote_addr, remote_port);
+
+  std::string local_addr;
+  int local_port = 0;
+  detail::get_local_ip_and_port(sock, local_addr, local_port);
+
+  ret = detail::process_server_socket_ssl(
+      svr_sock_, session, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
+      read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
+      write_timeout_usec_,
+      [&](Stream &strm, bool close_connection, bool &connection_closed) {
+        return process_request(strm, remote_addr, remote_port, local_addr,
+                               local_port, close_connection, connection_closed,
+                               [&](Request &req) { req.ssl = session; });
+      });
+
+  return ret;
+}
+
+bool SSLServer::update_certs_pem(const char *cert_pem,
+                                        const char *key_pem,
+                                        const char *client_ca_pem,
+                                        const char *password) {
+  if (!ctx_) { return false; }
+  std::lock_guard<std::mutex> guard(ctx_mutex_);
+  if (!tls::update_server_cert(ctx_, cert_pem, key_pem, password)) {
+    return false;
+  }
+  if (client_ca_pem) {
+    return tls::update_server_client_ca(ctx_, client_ca_pem);
+  }
+  return true;
+}
+
+// SSL HTTP client implementation
+SSLClient::~SSLClient() {
+  if (ctx_) { tls::free_context(ctx_); }
+  // Make sure to shut down SSL since shutdown_ssl will resolve to the
+  // base function rather than the derived function once we get to the
+  // base class destructor, and won't free the SSL (causing a leak).
+  shutdown_ssl_impl(socket_, true);
+}
+
+bool SSLClient::is_valid() const { return ctx_ != nullptr; }
+
+void SSLClient::shutdown_ssl(Socket &socket, bool shutdown_gracefully) {
+  shutdown_ssl_impl(socket, shutdown_gracefully);
+}
+
+void SSLClient::shutdown_ssl_impl(Socket &socket,
+                                         bool shutdown_gracefully) {
+  if (socket.sock == INVALID_SOCKET) {
+    assert(socket.ssl == nullptr);
+    return;
+  }
+  if (socket.ssl) {
+    tls::shutdown(socket.ssl, shutdown_gracefully);
+    {
+      std::lock_guard<std::mutex> guard(ctx_mutex_);
+      tls::free_session(socket.ssl);
+    }
+    socket.ssl = nullptr;
+  }
+  assert(socket.ssl == nullptr);
+}
+
+bool SSLClient::process_socket(
+    const Socket &socket,
+    std::chrono::time_point<std::chrono::steady_clock> start_time,
+    std::function<bool(Stream &strm)> callback) {
+  assert(socket.ssl);
+  return detail::process_client_socket_ssl(
+      socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_,
+      write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, start_time,
+      std::move(callback));
+}
+
+bool SSLClient::is_ssl() const { return true; }
+
+bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
+  if (!is_valid()) {
+    error = Error::SSLConnection;
+    return false;
+  }
+  return ClientImpl::create_and_connect_socket(socket, error);
+}
+
+// Assumes that socket_mutex_ is locked and that there are no requests in
+// flight
+bool SSLClient::connect_with_proxy(
+    Socket &socket,
+    std::chrono::time_point<std::chrono::steady_clock> start_time,
+    Response &res, bool &success, Error &error) {
+  success = true;
+  Response proxy_res;
+  if (!detail::process_client_socket(
+          socket.sock, read_timeout_sec_, read_timeout_usec_,
+          write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
+          start_time, [&](Stream &strm) {
+            Request req2;
+            req2.method = "CONNECT";
+            req2.path =
+                detail::make_host_and_port_string_always_port(host_, port_);
+            if (max_timeout_msec_ > 0) {
+              req2.start_time_ = std::chrono::steady_clock::now();
+            }
+            return process_request(strm, req2, proxy_res, false, error);
+          })) {
+    // Thread-safe to close everything because we are assuming there are no
+    // requests in flight
+    shutdown_ssl(socket, true);
+    shutdown_socket(socket);
+    close_socket(socket);
+    success = false;
+    return false;
+  }
+
+  if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) {
+    if (!proxy_digest_auth_username_.empty() &&
+        !proxy_digest_auth_password_.empty()) {
+      std::map<std::string, std::string> auth;
+      if (detail::parse_www_authenticate(proxy_res, auth, true)) {
+        // Close the current socket and create a new one for the authenticated
+        // request
+        shutdown_ssl(socket, true);
+        shutdown_socket(socket);
+        close_socket(socket);
+
+        // Create a new socket for the authenticated CONNECT request
+        if (!ensure_socket_connection(socket, error)) {
+          success = false;
+          output_error_log(error, nullptr);
+          return false;
+        }
+
+        proxy_res = Response();
+        if (!detail::process_client_socket(
+                socket.sock, read_timeout_sec_, read_timeout_usec_,
+                write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
+                start_time, [&](Stream &strm) {
+                  Request req3;
+                  req3.method = "CONNECT";
+                  req3.path = detail::make_host_and_port_string_always_port(
+                      host_, port_);
+                  req3.headers.insert(detail::make_digest_authentication_header(
+                      req3, auth, 1, detail::random_string(10),
+                      proxy_digest_auth_username_, proxy_digest_auth_password_,
+                      true));
+                  if (max_timeout_msec_ > 0) {
+                    req3.start_time_ = std::chrono::steady_clock::now();
+                  }
+                  return process_request(strm, req3, proxy_res, false, error);
+                })) {
+          // Thread-safe to close everything because we are assuming there are
+          // no requests in flight
+          shutdown_ssl(socket, true);
+          shutdown_socket(socket);
+          close_socket(socket);
+          success = false;
+          return false;
+        }
+      }
+    }
+  }
+
+  // If status code is not 200, proxy request is failed.
+  // Set error to ProxyConnection and return proxy response
+  // as the response of the request
+  if (proxy_res.status != StatusCode::OK_200) {
+    error = Error::ProxyConnection;
+    output_error_log(error, nullptr);
+    res = std::move(proxy_res);
+    // Thread-safe to close everything because we are assuming there are
+    // no requests in flight
+    shutdown_ssl(socket, true);
+    shutdown_socket(socket);
+    close_socket(socket);
+    return false;
+  }
+
+  return true;
+}
+
+bool SSLClient::ensure_socket_connection(Socket &socket, Error &error) {
+  if (!ClientImpl::ensure_socket_connection(socket, error)) { return false; }
+
+  if (!proxy_host_.empty() && proxy_port_ != -1) { return true; }
+
+  if (!initialize_ssl(socket, error)) {
+    shutdown_socket(socket);
+    close_socket(socket);
+    return false;
+  }
+
+  return true;
+}
+
+// SSL HTTP client implementation
+SSLClient::SSLClient(const std::string &host)
+    : SSLClient(host, 443, std::string(), std::string()) {}
+
+SSLClient::SSLClient(const std::string &host, int port)
+    : SSLClient(host, port, std::string(), std::string()) {}
+
+SSLClient::SSLClient(const std::string &host, int port,
+                            const std::string &client_cert_path,
+                            const std::string &client_key_path,
+                            const std::string &private_key_password)
+    : ClientImpl(host, port, client_cert_path, client_key_path) {
+  ctx_ = tls::create_client_context();
+  if (!ctx_) { return; }
+
+  tls::set_min_version(ctx_, tls::Version::TLS1_2);
+
+  if (!client_cert_path.empty() && !client_key_path.empty()) {
+    const char *password =
+        private_key_password.empty() ? nullptr : private_key_password.c_str();
+    if (!tls::set_client_cert_file(ctx_, client_cert_path.c_str(),
+                                   client_key_path.c_str(), password)) {
+      last_backend_error_ = tls::get_error();
+      tls::free_context(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+}
+
+SSLClient::SSLClient(const std::string &host, int port,
+                            const PemMemory &pem)
+    : ClientImpl(host, port) {
+  ctx_ = tls::create_client_context();
+  if (!ctx_) { return; }
+
+  tls::set_min_version(ctx_, tls::Version::TLS1_2);
+
+  if (pem.cert_pem && pem.key_pem) {
+    if (!tls::set_client_cert_pem(ctx_, pem.cert_pem, pem.key_pem,
+                                  pem.private_key_password)) {
+      last_backend_error_ = tls::get_error();
+      tls::free_context(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+}
+
+void SSLClient::set_ca_cert_store(tls::ca_store_t ca_cert_store) {
+  if (ca_cert_store && ctx_) {
+    // set_ca_store takes ownership of ca_cert_store
+    tls::set_ca_store(ctx_, ca_cert_store);
+  } else if (ca_cert_store) {
+    tls::free_ca_store(ca_cert_store);
+  }
+}
+
+void
+SSLClient::set_server_certificate_verifier(tls::VerifyCallback verifier) {
+  if (!ctx_) { return; }
+  tls::set_verify_callback(ctx_, verifier);
+}
+
+void SSLClient::set_session_verifier(
+    std::function<SSLVerifierResponse(tls::session_t)> verifier) {
+  session_verifier_ = std::move(verifier);
+}
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+void SSLClient::enable_windows_certificate_verification(bool enabled) {
+  enable_windows_cert_verification_ = enabled;
+}
+#endif
+
+void SSLClient::load_ca_cert_store(const char *ca_cert,
+                                          std::size_t size) {
+  if (ctx_ && ca_cert && size > 0) {
+    ca_cert_pem_.assign(ca_cert, size); // Store for redirect transfer
+    tls::load_ca_pem(ctx_, ca_cert, size);
+  }
+}
+
+bool SSLClient::load_certs() {
+  auto ret = true;
+
+  std::call_once(initialize_cert_, [&]() {
+    std::lock_guard<std::mutex> guard(ctx_mutex_);
+
+    if (!ca_cert_file_path_.empty()) {
+      if (!tls::load_ca_file(ctx_, ca_cert_file_path_.c_str())) {
+        last_backend_error_ = tls::get_error();
+        ret = false;
+      }
+    } else if (!ca_cert_dir_path_.empty()) {
+      if (!tls::load_ca_dir(ctx_, ca_cert_dir_path_.c_str())) {
+        last_backend_error_ = tls::get_error();
+        ret = false;
+      }
+    } else if (ca_cert_pem_.empty()) {
+      if (!tls::load_system_certs(ctx_)) {
+        last_backend_error_ = tls::get_error();
+      }
+    }
+  });
+
+  return ret;
+}
+
+bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
+  using namespace tls;
+
+  // Load CA certificates if server verification is enabled
+  if (server_certificate_verification_) {
+    if (!load_certs()) {
+      error = Error::SSLLoadingCerts;
+      output_error_log(error, nullptr);
+      return false;
+    }
+  }
+
+  bool is_ip = detail::is_ip_address(host_);
+
+#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT
+  // MbedTLS needs explicit verification mode (OpenSSL uses SSL_VERIFY_NONE
+  // by default and performs all verification post-handshake).
+  // For IP addresses with verification enabled, use OPTIONAL mode since
+  // MbedTLS requires hostname for VERIFY_REQUIRED.
+  if (is_ip && server_certificate_verification_) {
+    set_verify_client(ctx_, false);
+  } else {
+    set_verify_client(ctx_, server_certificate_verification_);
+  }
+#endif
+
+  // Create TLS session
+  session_t session = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(ctx_mutex_);
+    session = create_session(ctx_, socket.sock);
+  }
+
+  if (!session) {
+    error = Error::SSLConnection;
+    last_backend_error_ = get_error();
+    return false;
+  }
+
+  // Use scope_exit to ensure session is freed on error paths
+  bool success = false;
+  auto session_guard = detail::scope_exit([&] {
+    if (!success) { free_session(session); }
+  });
+
+  // Set SNI extension (skip for IP addresses per RFC 6066).
+  // On MbedTLS, set_sni also enables hostname verification internally.
+  // On OpenSSL, set_sni only sets SNI; verification is done post-handshake.
+  if (!is_ip) {
+    if (!set_sni(session, host_.c_str())) {
+      error = Error::SSLConnection;
+      last_backend_error_ = get_error();
+      return false;
+    }
+  }
+
+  // Perform non-blocking TLS handshake with timeout
+  TlsError tls_err;
+  if (!connect_nonblocking(session, socket.sock, connection_timeout_sec_,
+                           connection_timeout_usec_, &tls_err)) {
+    last_ssl_error_ = static_cast<int>(tls_err.code);
+    last_backend_error_ = tls_err.backend_code;
+    if (tls_err.code == ErrorCode::CertVerifyFailed) {
+      error = Error::SSLServerVerification;
+    } else if (tls_err.code == ErrorCode::HostnameMismatch) {
+      error = Error::SSLServerHostnameVerification;
+    } else {
+      error = Error::SSLConnection;
+    }
+    output_error_log(error, nullptr);
+    return false;
+  }
+
+  // Post-handshake session verifier callback
+  auto verification_status = SSLVerifierResponse::NoDecisionMade;
+  if (session_verifier_) { verification_status = session_verifier_(session); }
+
+  if (verification_status == SSLVerifierResponse::CertificateRejected) {
+    last_backend_error_ = get_error();
+    error = Error::SSLServerVerification;
+    output_error_log(error, nullptr);
+    return false;
+  }
+
+  // Default server certificate verification
+  if (verification_status == SSLVerifierResponse::NoDecisionMade &&
+      server_certificate_verification_) {
+    verify_result_ = tls::get_verify_result(session);
+    if (verify_result_ != 0) {
+      last_backend_error_ = static_cast<unsigned long>(verify_result_);
+      error = Error::SSLServerVerification;
+      output_error_log(error, nullptr);
+      return false;
+    }
+
+    auto server_cert = get_peer_cert(session);
+    if (!server_cert) {
+      last_backend_error_ = get_error();
+      error = Error::SSLServerVerification;
+      output_error_log(error, nullptr);
+      return false;
+    }
+    auto cert_guard = detail::scope_exit([&] { free_cert(server_cert); });
+
+    // Hostname verification (post-handshake for all cases).
+    // On OpenSSL, verification is always post-handshake (SSL_VERIFY_NONE).
+    // On MbedTLS, set_sni already enabled hostname verification during
+    // handshake for non-IP hosts, but this check is still needed for IP
+    // addresses where SNI is not set.
+    if (server_hostname_verification_) {
+      if (!verify_hostname(server_cert, host_.c_str())) {
+        last_backend_error_ = hostname_mismatch_code();
+        error = Error::SSLServerHostnameVerification;
+        output_error_log(error, nullptr);
+        return false;
+      }
+    }
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+    // Additional Windows Schannel verification.
+    // This provides real-time certificate validation with Windows Update
+    // integration, working with both OpenSSL and MbedTLS backends.
+    // Skip when a custom CA cert is specified, as the Windows certificate
+    // store would not know about user-provided CA certificates.
+    if (enable_windows_cert_verification_ && ca_cert_file_path_.empty() &&
+        ca_cert_dir_path_.empty() && ca_cert_pem_.empty()) {
+      std::vector<unsigned char> der;
+      if (get_cert_der(server_cert, der)) {
+        unsigned long wincrypt_error = 0;
+        if (!detail::verify_cert_with_windows_schannel(
+                der, host_, server_hostname_verification_, wincrypt_error)) {
+          last_backend_error_ = wincrypt_error;
+          error = Error::SSLServerVerification;
+          output_error_log(error, nullptr);
+          return false;
+        }
+      }
+    }
+#endif
+  }
+
+  success = true;
+  socket.ssl = session;
+  return true;
+}
+
+void Client::set_digest_auth(const std::string &username,
+                                    const std::string &password) {
+  cli_->set_digest_auth(username, password);
+}
+
+void Client::set_proxy_digest_auth(const std::string &username,
+                                          const std::string &password) {
+  cli_->set_proxy_digest_auth(username, password);
+}
+
+void Client::enable_server_certificate_verification(bool enabled) {
+  cli_->enable_server_certificate_verification(enabled);
+}
+
+void Client::enable_server_hostname_verification(bool enabled) {
+  cli_->enable_server_hostname_verification(enabled);
+}
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+void Client::enable_windows_certificate_verification(bool enabled) {
+  if (is_ssl_) {
+    static_cast<SSLClient &>(*cli_).enable_windows_certificate_verification(
+        enabled);
+  }
+}
+#endif
+
 void Client::set_ca_cert_path(const std::string &ca_cert_file_path,
                                      const std::string &ca_cert_dir_path) {
   cli_->set_ca_cert_path(ca_cert_file_path, ca_cert_dir_path);
 }
 
-void Client::set_ca_cert_store(X509_STORE *ca_cert_store) {
+void Client::set_ca_cert_store(tls::ca_store_t ca_cert_store) {
   if (is_ssl_) {
     static_cast<SSLClient &>(*cli_).set_ca_cert_store(ca_cert_store);
-  } else {
-    cli_->set_ca_cert_store(ca_cert_store);
+  } else if (ca_cert_store) {
+    tls::free_ca_store(ca_cert_store);
   }
 }
 
 void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) {
-  set_ca_cert_store(cli_->create_ca_cert_store(ca_cert, size));
+  set_ca_cert_store(tls::create_ca_store(ca_cert, size));
 }
 
-long Client::get_openssl_verify_result() const {
+void
+Client::set_server_certificate_verifier(tls::VerifyCallback verifier) {
   if (is_ssl_) {
-    return static_cast<SSLClient &>(*cli_).get_openssl_verify_result();
+    static_cast<SSLClient &>(*cli_).set_server_certificate_verifier(
+        std::move(verifier));
   }
-  return -1; // NOTE: -1 doesn't match any of X509_V_ERR_???
 }
 
+void Client::set_session_verifier(
+    std::function<SSLVerifierResponse(tls::session_t)> verifier) {
+  if (is_ssl_) {
+    static_cast<SSLClient &>(*cli_).set_session_verifier(std::move(verifier));
+  }
+}
+
+tls::ctx_t Client::tls_context() const {
+  if (is_ssl_) { return static_cast<SSLClient &>(*cli_).tls_context(); }
+  return nullptr;
+}
+
+#endif // CPPHTTPLIB_SSL_ENABLED
+
+/*
+ * Group 7: TLS abstraction layer - Common API
+ */
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+
+namespace tls {
+
+// Helper for PeerCert construction
+PeerCert get_peer_cert_from_session(const_session_t session) {
+  return PeerCert(get_peer_cert(session));
+}
+
+namespace impl {
+
+VerifyCallback &get_verify_callback() {
+  static thread_local VerifyCallback callback;
+  return callback;
+}
+
+VerifyCallback &get_mbedtls_verify_callback() {
+  static thread_local VerifyCallback callback;
+  return callback;
+}
+
+} // namespace impl
+
+bool set_client_ca_file(ctx_t ctx, const char *ca_file,
+                               const char *ca_dir) {
+  if (!ctx) { return false; }
+
+  bool success = true;
+  if (ca_file && *ca_file) {
+    if (!load_ca_file(ctx, ca_file)) { success = false; }
+  }
+  if (ca_dir && *ca_dir) {
+    if (!load_ca_dir(ctx, ca_dir)) { success = false; }
+  }
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+  // Set CA list for client certificate request (CertificateRequest message)
+  if (ca_file && *ca_file) {
+    auto list = SSL_load_client_CA_file(ca_file);
+    if (list) { SSL_CTX_set_client_CA_list(static_cast<SSL_CTX *>(ctx), list); }
+  }
+#endif
+
+  return success;
+}
+
+bool set_server_cert_pem(ctx_t ctx, const char *cert, const char *key,
+                                const char *password) {
+  return set_client_cert_pem(ctx, cert, key, password);
+}
+
+bool set_server_cert_file(ctx_t ctx, const char *cert_path,
+                                 const char *key_path, const char *password) {
+  return set_client_cert_file(ctx, cert_path, key_path, password);
+}
+
+// PeerCert implementation
+PeerCert::PeerCert() = default;
+
+PeerCert::PeerCert(cert_t cert) : cert_(cert) {}
+
+PeerCert::PeerCert(PeerCert &&other) noexcept : cert_(other.cert_) {
+  other.cert_ = nullptr;
+}
+
+PeerCert &PeerCert::operator=(PeerCert &&other) noexcept {
+  if (this != &other) {
+    if (cert_) { free_cert(cert_); }
+    cert_ = other.cert_;
+    other.cert_ = nullptr;
+  }
+  return *this;
+}
+
+PeerCert::~PeerCert() {
+  if (cert_) { free_cert(cert_); }
+}
+
+PeerCert::operator bool() const { return cert_ != nullptr; }
+
+std::string PeerCert::subject_cn() const {
+  return cert_ ? get_cert_subject_cn(cert_) : std::string();
+}
+
+std::string PeerCert::issuer_name() const {
+  return cert_ ? get_cert_issuer_name(cert_) : std::string();
+}
+
+bool PeerCert::check_hostname(const char *hostname) const {
+  return cert_ ? verify_hostname(cert_, hostname) : false;
+}
+
+std::vector<SanEntry> PeerCert::sans() const {
+  std::vector<SanEntry> result;
+  if (cert_) { get_cert_sans(cert_, result); }
+  return result;
+}
+
+bool PeerCert::validity(time_t &not_before, time_t &not_after) const {
+  return cert_ ? get_cert_validity(cert_, not_before, not_after) : false;
+}
+
+std::string PeerCert::serial() const {
+  return cert_ ? get_cert_serial(cert_) : std::string();
+}
+
+// VerifyContext method implementations
+std::string VerifyContext::subject_cn() const {
+  return cert ? get_cert_subject_cn(cert) : std::string();
+}
+
+std::string VerifyContext::issuer_name() const {
+  return cert ? get_cert_issuer_name(cert) : std::string();
+}
+
+bool VerifyContext::check_hostname(const char *hostname) const {
+  return cert ? verify_hostname(cert, hostname) : false;
+}
+
+std::vector<SanEntry> VerifyContext::sans() const {
+  std::vector<SanEntry> result;
+  if (cert) { get_cert_sans(cert, result); }
+  return result;
+}
+
+bool VerifyContext::validity(time_t &not_before,
+                                    time_t &not_after) const {
+  return cert ? get_cert_validity(cert, not_before, not_after) : false;
+}
+
+std::string VerifyContext::serial() const {
+  return cert ? get_cert_serial(cert) : std::string();
+}
+
+// TlsError static method implementation
+std::string TlsError::verify_error_to_string(long error_code) {
+  return verify_error_string(error_code);
+}
+
+} // namespace tls
+
+// Request::peer_cert() implementation
+tls::PeerCert Request::peer_cert() const {
+  return tls::get_peer_cert_from_session(ssl);
+}
+
+// Request::sni() implementation
+std::string Request::sni() const {
+  if (!ssl) { return std::string(); }
+  const char *s = tls::get_sni(ssl);
+  return s ? std::string(s) : std::string();
+}
+
+#endif // CPPHTTPLIB_SSL_ENABLED
+
+/*
+ * Group 8: TLS abstraction layer - OpenSSL backend
+ */
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 SSL_CTX *Client::ssl_context() const {
   if (is_ssl_) { return static_cast<SSLClient &>(*cli_).ssl_context(); }
   return nullptr;
 }
+
+void Client::set_server_certificate_verifier(
+    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
+  cli_->set_server_certificate_verifier(verifier);
+}
+
+long Client::get_verify_result() const {
+  if (is_ssl_) { return static_cast<SSLClient &>(*cli_).get_verify_result(); }
+  return -1; // NOTE: -1 doesn't match any of X509_V_ERR_???
+}
+#endif // CPPHTTPLIB_OPENSSL_SUPPORT
+
+/*
+ * OpenSSL Backend Implementation
+ */
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+namespace tls {
+
+namespace impl {
+
+// OpenSSL-specific helpers for converting native types to PEM
+std::string x509_to_pem(X509 *cert) {
+  if (!cert) return {};
+  BIO *bio = BIO_new(BIO_s_mem());
+  if (!bio) return {};
+  if (PEM_write_bio_X509(bio, cert) != 1) {
+    BIO_free(bio);
+    return {};
+  }
+  char *data = nullptr;
+  long len = BIO_get_mem_data(bio, &data);
+  std::string pem(data, static_cast<size_t>(len));
+  BIO_free(bio);
+  return pem;
+}
+
+std::string evp_pkey_to_pem(EVP_PKEY *key) {
+  if (!key) return {};
+  BIO *bio = BIO_new(BIO_s_mem());
+  if (!bio) return {};
+  if (PEM_write_bio_PrivateKey(bio, key, nullptr, nullptr, 0, nullptr,
+                               nullptr) != 1) {
+    BIO_free(bio);
+    return {};
+  }
+  char *data = nullptr;
+  long len = BIO_get_mem_data(bio, &data);
+  std::string pem(data, static_cast<size_t>(len));
+  BIO_free(bio);
+  return pem;
+}
+
+std::string x509_store_to_pem(X509_STORE *store) {
+  if (!store) return {};
+  std::string pem;
+  auto objs = X509_STORE_get0_objects(store);
+  if (!objs) return {};
+  auto count = sk_X509_OBJECT_num(objs);
+  for (decltype(count) i = 0; i < count; i++) {
+    auto obj = sk_X509_OBJECT_value(objs, i);
+    if (X509_OBJECT_get_type(obj) == X509_LU_X509) {
+      auto cert = X509_OBJECT_get0_X509(obj);
+      if (cert) { pem += x509_to_pem(cert); }
+    }
+  }
+  return pem;
+}
+
+// Helper to map OpenSSL SSL_get_error to ErrorCode
+ErrorCode map_ssl_error(int ssl_error, int &out_errno) {
+  switch (ssl_error) {
+  case SSL_ERROR_NONE: return ErrorCode::Success;
+  case SSL_ERROR_WANT_READ: return ErrorCode::WantRead;
+  case SSL_ERROR_WANT_WRITE: return ErrorCode::WantWrite;
+  case SSL_ERROR_ZERO_RETURN: return ErrorCode::PeerClosed;
+  case SSL_ERROR_SYSCALL: out_errno = errno; return ErrorCode::SyscallError;
+  case SSL_ERROR_SSL:
+  default: return ErrorCode::Fatal;
+  }
+}
+
+// Helper: Create client CA list from PEM string
+// Returns a new STACK_OF(X509_NAME)* or nullptr on failure
+// Caller takes ownership of returned list
+STACK_OF(X509_NAME) *
+    create_client_ca_list_from_pem(const char *ca_pem) {
+  if (!ca_pem) { return nullptr; }
+
+  auto ca_list = sk_X509_NAME_new_null();
+  if (!ca_list) { return nullptr; }
+
+  BIO *bio = BIO_new_mem_buf(ca_pem, -1);
+  if (!bio) {
+    sk_X509_NAME_pop_free(ca_list, X509_NAME_free);
+    return nullptr;
+  }
+
+  X509 *cert = nullptr;
+  while ((cert = PEM_read_bio_X509(bio, nullptr, nullptr, nullptr)) !=
+         nullptr) {
+    X509_NAME *name = X509_get_subject_name(cert);
+    if (name) { sk_X509_NAME_push(ca_list, X509_NAME_dup(name)); }
+    X509_free(cert);
+  }
+  BIO_free(bio);
+
+  return ca_list;
+}
+
+// Helper: Extract CA names from X509_STORE
+// Returns a new STACK_OF(X509_NAME)* or nullptr on failure
+// Caller takes ownership of returned list
+STACK_OF(X509_NAME) *
+    extract_client_ca_list_from_store(X509_STORE *store) {
+  if (!store) { return nullptr; }
+
+  auto ca_list = sk_X509_NAME_new_null();
+  if (!ca_list) { return nullptr; }
+
+  auto objs = X509_STORE_get0_objects(store);
+  if (!objs) {
+    sk_X509_NAME_free(ca_list);
+    return nullptr;
+  }
+
+  auto count = sk_X509_OBJECT_num(objs);
+  for (decltype(count) i = 0; i < count; i++) {
+    auto obj = sk_X509_OBJECT_value(objs, i);
+    if (X509_OBJECT_get_type(obj) == X509_LU_X509) {
+      auto cert = X509_OBJECT_get0_X509(obj);
+      if (cert) {
+        auto subject = X509_get_subject_name(cert);
+        if (subject) {
+          auto name_dup = X509_NAME_dup(subject);
+          if (name_dup) { sk_X509_NAME_push(ca_list, name_dup); }
+        }
+      }
+    }
+  }
+
+  if (sk_X509_NAME_num(ca_list) == 0) {
+    sk_X509_NAME_free(ca_list);
+    return nullptr;
+  }
+
+  return ca_list;
+}
+
+// OpenSSL verify callback wrapper
+int openssl_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) {
+  auto &callback = get_verify_callback();
+  if (!callback) { return preverify_ok; }
+
+  // Get SSL object from X509_STORE_CTX
+  auto ssl = static_cast<SSL *>(
+      X509_STORE_CTX_get_ex_data(ctx, SSL_get_ex_data_X509_STORE_CTX_idx()));
+  if (!ssl) { return preverify_ok; }
+
+  // Get current certificate and depth
+  auto cert = X509_STORE_CTX_get_current_cert(ctx);
+  int depth = X509_STORE_CTX_get_error_depth(ctx);
+  int error = X509_STORE_CTX_get_error(ctx);
+
+  // Build context
+  VerifyContext verify_ctx;
+  verify_ctx.session = static_cast<session_t>(ssl);
+  verify_ctx.cert = static_cast<cert_t>(cert);
+  verify_ctx.depth = depth;
+  verify_ctx.preverify_ok = (preverify_ok != 0);
+  verify_ctx.error_code = error;
+  verify_ctx.error_string =
+      (error != X509_V_OK) ? X509_verify_cert_error_string(error) : nullptr;
+
+  return callback(verify_ctx) ? 1 : 0;
+}
+
+} // namespace impl
+
+ctx_t create_client_context() {
+  SSL_CTX *ctx = SSL_CTX_new(TLS_client_method());
+  if (ctx) {
+    // Disable auto-retry to properly handle non-blocking I/O
+    SSL_CTX_clear_mode(ctx, SSL_MODE_AUTO_RETRY);
+    // Set minimum TLS version
+    SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION);
+  }
+  return static_cast<ctx_t>(ctx);
+}
+
+void free_context(ctx_t ctx) {
+  if (ctx) { SSL_CTX_free(static_cast<SSL_CTX *>(ctx)); }
+}
+
+bool set_min_version(ctx_t ctx, Version version) {
+  if (!ctx) return false;
+  return SSL_CTX_set_min_proto_version(static_cast<SSL_CTX *>(ctx),
+                                       static_cast<int>(version)) == 1;
+}
+
+bool load_ca_pem(ctx_t ctx, const char *pem, size_t len) {
+  if (!ctx || !pem || len == 0) return false;
+
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+  auto store = SSL_CTX_get_cert_store(ssl_ctx);
+  if (!store) return false;
+
+  auto bio = BIO_new_mem_buf(pem, static_cast<int>(len));
+  if (!bio) return false;
+
+  bool ok = true;
+  X509 *cert = nullptr;
+  while ((cert = PEM_read_bio_X509(bio, nullptr, nullptr, nullptr)) !=
+         nullptr) {
+    if (X509_STORE_add_cert(store, cert) != 1) {
+      // Ignore duplicate errors
+      auto err = ERR_peek_last_error();
+      if (ERR_GET_REASON(err) != X509_R_CERT_ALREADY_IN_HASH_TABLE) {
+        ok = false;
+      }
+    }
+    X509_free(cert);
+    if (!ok) break;
+  }
+  BIO_free(bio);
+
+  // Clear any "no more certificates" errors
+  ERR_clear_error();
+  return ok;
+}
+
+bool load_ca_file(ctx_t ctx, const char *file_path) {
+  if (!ctx || !file_path) return false;
+  return SSL_CTX_load_verify_locations(static_cast<SSL_CTX *>(ctx), file_path,
+                                       nullptr) == 1;
+}
+
+bool load_ca_dir(ctx_t ctx, const char *dir_path) {
+  if (!ctx || !dir_path) return false;
+  return SSL_CTX_load_verify_locations(static_cast<SSL_CTX *>(ctx), nullptr,
+                                       dir_path) == 1;
+}
+
+bool load_system_certs(ctx_t ctx) {
+  if (!ctx) return false;
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+#ifdef _WIN32
+  // Windows: Load from system certificate store (ROOT and CA)
+  auto store = SSL_CTX_get_cert_store(ssl_ctx);
+  if (!store) return false;
+
+  bool loaded_any = false;
+  static const wchar_t *store_names[] = {L"ROOT", L"CA"};
+  for (auto store_name : store_names) {
+    auto hStore = CertOpenSystemStoreW(NULL, store_name);
+    if (!hStore) continue;
+
+    PCCERT_CONTEXT pContext = nullptr;
+    while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) !=
+           nullptr) {
+      const unsigned char *data = pContext->pbCertEncoded;
+      auto x509 = d2i_X509(nullptr, &data, pContext->cbCertEncoded);
+      if (x509) {
+        if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
+        X509_free(x509);
+      }
+    }
+    CertCloseStore(hStore, 0);
+  }
+  return loaded_any;
+
+#elif defined(__APPLE__)
+#ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
+  // macOS: Load from Keychain
+  auto store = SSL_CTX_get_cert_store(ssl_ctx);
+  if (!store) return false;
+
+  CFArrayRef certs = nullptr;
+  if (SecTrustCopyAnchorCertificates(&certs) != errSecSuccess || !certs) {
+    return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
+  }
+
+  bool loaded_any = false;
+  auto count = CFArrayGetCount(certs);
+  for (CFIndex i = 0; i < count; i++) {
+    auto cert = reinterpret_cast<SecCertificateRef>(
+        const_cast<void *>(CFArrayGetValueAtIndex(certs, i)));
+    CFDataRef der = SecCertificateCopyData(cert);
+    if (der) {
+      const unsigned char *data = CFDataGetBytePtr(der);
+      auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der));
+      if (x509) {
+        if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
+        X509_free(x509);
+      }
+      CFRelease(der);
+    }
+  }
+  CFRelease(certs);
+  return loaded_any || SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
+#else
+  return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
 #endif
 
+#else
+  // Other Unix: use default verify paths
+  return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
+#endif
+}
+
+bool set_client_cert_pem(ctx_t ctx, const char *cert, const char *key,
+                                const char *password) {
+  if (!ctx || !cert || !key) return false;
+
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  // Load certificate
+  auto cert_bio = BIO_new_mem_buf(cert, -1);
+  if (!cert_bio) return false;
+
+  auto x509 = PEM_read_bio_X509(cert_bio, nullptr, nullptr, nullptr);
+  BIO_free(cert_bio);
+  if (!x509) return false;
+
+  auto cert_ok = SSL_CTX_use_certificate(ssl_ctx, x509) == 1;
+  X509_free(x509);
+  if (!cert_ok) return false;
+
+  // Load private key
+  auto key_bio = BIO_new_mem_buf(key, -1);
+  if (!key_bio) return false;
+
+  auto pkey = PEM_read_bio_PrivateKey(key_bio, nullptr, nullptr,
+                                      password ? const_cast<char *>(password)
+                                               : nullptr);
+  BIO_free(key_bio);
+  if (!pkey) return false;
+
+  auto key_ok = SSL_CTX_use_PrivateKey(ssl_ctx, pkey) == 1;
+  EVP_PKEY_free(pkey);
+
+  return key_ok && SSL_CTX_check_private_key(ssl_ctx) == 1;
+}
+
+bool set_client_cert_file(ctx_t ctx, const char *cert_path,
+                                 const char *key_path, const char *password) {
+  if (!ctx || !cert_path || !key_path) return false;
+
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  if (password && password[0] != '\0') {
+    SSL_CTX_set_default_passwd_cb_userdata(
+        ssl_ctx, reinterpret_cast<void *>(const_cast<char *>(password)));
+  }
+
+  return SSL_CTX_use_certificate_chain_file(ssl_ctx, cert_path) == 1 &&
+         SSL_CTX_use_PrivateKey_file(ssl_ctx, key_path, SSL_FILETYPE_PEM) == 1;
+}
+
+ctx_t create_server_context() {
+  SSL_CTX *ctx = SSL_CTX_new(TLS_server_method());
+  if (ctx) {
+    SSL_CTX_set_options(ctx, SSL_OP_NO_COMPRESSION |
+                                 SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
+    SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION);
+  }
+  return static_cast<ctx_t>(ctx);
+}
+
+void set_verify_client(ctx_t ctx, bool require) {
+  if (!ctx) return;
+  SSL_CTX_set_verify(static_cast<SSL_CTX *>(ctx),
+                     require
+                         ? (SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT)
+                         : SSL_VERIFY_NONE,
+                     nullptr);
+}
+
+session_t create_session(ctx_t ctx, socket_t sock) {
+  if (!ctx || sock == INVALID_SOCKET) return nullptr;
+
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+  SSL *ssl = SSL_new(ssl_ctx);
+  if (!ssl) return nullptr;
+
+  // Disable auto-retry for proper non-blocking I/O handling
+  SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
+
+  auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
+  if (!bio) {
+    SSL_free(ssl);
+    return nullptr;
+  }
+
+  SSL_set_bio(ssl, bio, bio);
+  return static_cast<session_t>(ssl);
+}
+
+void free_session(session_t session) {
+  if (session) { SSL_free(static_cast<SSL *>(session)); }
+}
+
+bool set_sni(session_t session, const char *hostname) {
+  if (!session || !hostname) return false;
+
+  auto ssl = static_cast<SSL *>(session);
+
+  // Set SNI (Server Name Indication) only - does not enable verification
+#if defined(OPENSSL_IS_BORINGSSL)
+  return SSL_set_tlsext_host_name(ssl, hostname) == 1;
+#else
+  // Direct call instead of macro to suppress -Wold-style-cast warning
+  return SSL_ctrl(ssl, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name,
+                  static_cast<void *>(const_cast<char *>(hostname))) == 1;
+#endif
+}
+
+bool set_hostname(session_t session, const char *hostname) {
+  if (!session || !hostname) return false;
+
+  auto ssl = static_cast<SSL *>(session);
+
+  // Set SNI (Server Name Indication)
+  if (!set_sni(session, hostname)) { return false; }
+
+  // Enable hostname verification
+  auto param = SSL_get0_param(ssl);
+  if (!param) return false;
+
+  X509_VERIFY_PARAM_set_hostflags(param, X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS);
+  if (X509_VERIFY_PARAM_set1_host(param, hostname, 0) != 1) { return false; }
+
+  SSL_set_verify(ssl, SSL_VERIFY_PEER, nullptr);
+  return true;
+}
+
+TlsError connect(session_t session) {
+  if (!session) { return TlsError(); }
+
+  auto ssl = static_cast<SSL *>(session);
+  auto ret = SSL_connect(ssl);
+
+  TlsError err;
+  if (ret == 1) {
+    err.code = ErrorCode::Success;
+  } else {
+    auto ssl_err = SSL_get_error(ssl, ret);
+    err.code = impl::map_ssl_error(ssl_err, err.sys_errno);
+    err.backend_code = ERR_get_error();
+  }
+  return err;
+}
+
+TlsError accept(session_t session) {
+  if (!session) { return TlsError(); }
+
+  auto ssl = static_cast<SSL *>(session);
+  auto ret = SSL_accept(ssl);
+
+  TlsError err;
+  if (ret == 1) {
+    err.code = ErrorCode::Success;
+  } else {
+    auto ssl_err = SSL_get_error(ssl, ret);
+    err.code = impl::map_ssl_error(ssl_err, err.sys_errno);
+    err.backend_code = ERR_get_error();
+  }
+  return err;
+}
+
+bool connect_nonblocking(session_t session, socket_t sock,
+                                time_t timeout_sec, time_t timeout_usec,
+                                TlsError *err) {
+  if (!session) {
+    if (err) { err->code = ErrorCode::Fatal; }
+    return false;
+  }
+
+  auto ssl = static_cast<SSL *>(session);
+  auto bio = SSL_get_rbio(ssl);
+
+  // Set non-blocking mode for handshake
+  detail::set_nonblocking(sock, true);
+  if (bio) { BIO_set_nbio(bio, 1); }
+
+  auto cleanup = detail::scope_exit([&]() {
+    // Restore blocking mode after handshake
+    if (bio) { BIO_set_nbio(bio, 0); }
+    detail::set_nonblocking(sock, false);
+  });
+
+  auto res = 0;
+  while ((res = SSL_connect(ssl)) != 1) {
+    auto ssl_err = SSL_get_error(ssl, res);
+    switch (ssl_err) {
+    case SSL_ERROR_WANT_READ:
+      if (detail::select_read(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+      break;
+    case SSL_ERROR_WANT_WRITE:
+      if (detail::select_write(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+      break;
+    default: break;
+    }
+    if (err) {
+      err->code = impl::map_ssl_error(ssl_err, err->sys_errno);
+      err->backend_code = ERR_get_error();
+    }
+    return false;
+  }
+  if (err) { err->code = ErrorCode::Success; }
+  return true;
+}
+
+bool accept_nonblocking(session_t session, socket_t sock,
+                               time_t timeout_sec, time_t timeout_usec,
+                               TlsError *err) {
+  if (!session) {
+    if (err) { err->code = ErrorCode::Fatal; }
+    return false;
+  }
+
+  auto ssl = static_cast<SSL *>(session);
+  auto bio = SSL_get_rbio(ssl);
+
+  // Set non-blocking mode for handshake
+  detail::set_nonblocking(sock, true);
+  if (bio) { BIO_set_nbio(bio, 1); }
+
+  auto cleanup = detail::scope_exit([&]() {
+    // Restore blocking mode after handshake
+    if (bio) { BIO_set_nbio(bio, 0); }
+    detail::set_nonblocking(sock, false);
+  });
+
+  auto res = 0;
+  while ((res = SSL_accept(ssl)) != 1) {
+    auto ssl_err = SSL_get_error(ssl, res);
+    switch (ssl_err) {
+    case SSL_ERROR_WANT_READ:
+      if (detail::select_read(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+      break;
+    case SSL_ERROR_WANT_WRITE:
+      if (detail::select_write(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+      break;
+    default: break;
+    }
+    if (err) {
+      err->code = impl::map_ssl_error(ssl_err, err->sys_errno);
+      err->backend_code = ERR_get_error();
+    }
+    return false;
+  }
+  if (err) { err->code = ErrorCode::Success; }
+  return true;
+}
+
+ssize_t read(session_t session, void *buf, size_t len, TlsError &err) {
+  if (!session || !buf) {
+    err.code = ErrorCode::Fatal;
+    return -1;
+  }
+
+  auto ssl = static_cast<SSL *>(session);
+  constexpr auto max_len =
+      static_cast<size_t>((std::numeric_limits<int>::max)());
+  if (len > max_len) { len = max_len; }
+  auto ret = SSL_read(ssl, buf, static_cast<int>(len));
+
+  if (ret > 0) {
+    err.code = ErrorCode::Success;
+    return ret;
+  }
+
+  auto ssl_err = SSL_get_error(ssl, ret);
+  err.code = impl::map_ssl_error(ssl_err, err.sys_errno);
+  if (err.code == ErrorCode::Fatal) { err.backend_code = ERR_get_error(); }
+  return -1;
+}
+
+ssize_t write(session_t session, const void *buf, size_t len,
+                     TlsError &err) {
+  if (!session || !buf) {
+    err.code = ErrorCode::Fatal;
+    return -1;
+  }
+
+  auto ssl = static_cast<SSL *>(session);
+  auto ret = SSL_write(ssl, buf, static_cast<int>(len));
+
+  if (ret > 0) {
+    err.code = ErrorCode::Success;
+    return ret;
+  }
+
+  auto ssl_err = SSL_get_error(ssl, ret);
+  err.code = impl::map_ssl_error(ssl_err, err.sys_errno);
+  if (err.code == ErrorCode::Fatal) { err.backend_code = ERR_get_error(); }
+  return -1;
+}
+
+int pending(const_session_t session) {
+  if (!session) return 0;
+  return SSL_pending(static_cast<SSL *>(const_cast<void *>(session)));
+}
+
+void shutdown(session_t session, bool graceful) {
+  if (!session) return;
+
+  auto ssl = static_cast<SSL *>(session);
+  if (graceful) {
+    // First call sends close_notify
+    if (SSL_shutdown(ssl) == 0) {
+      // Second call waits for peer's close_notify
+      SSL_shutdown(ssl);
+    }
+  }
+}
+
+bool is_peer_closed(session_t session, socket_t sock) {
+  if (!session) return true;
+
+  // Temporarily set socket to non-blocking to avoid blocking on SSL_peek
+  detail::set_nonblocking(sock, true);
+  auto se = detail::scope_exit([&]() { detail::set_nonblocking(sock, false); });
+
+  auto ssl = static_cast<SSL *>(session);
+  char buf;
+  auto ret = SSL_peek(ssl, &buf, 1);
+  if (ret > 0) return false;
+
+  auto err = SSL_get_error(ssl, ret);
+  return err == SSL_ERROR_ZERO_RETURN;
+}
+
+cert_t get_peer_cert(const_session_t session) {
+  if (!session) return nullptr;
+  return static_cast<cert_t>(SSL_get1_peer_certificate(
+      static_cast<SSL *>(const_cast<void *>(session))));
+}
+
+void free_cert(cert_t cert) {
+  if (cert) { X509_free(static_cast<X509 *>(cert)); }
+}
+
+bool verify_hostname(cert_t cert, const char *hostname) {
+  if (!cert || !hostname) return false;
+
+  auto x509 = static_cast<X509 *>(cert);
+
+  // Use X509_check_ip_asc for IP addresses, X509_check_host for DNS names
+  if (detail::is_ip_address(hostname)) {
+    return X509_check_ip_asc(x509, hostname, 0) == 1;
+  }
+  return X509_check_host(x509, hostname, strlen(hostname), 0, nullptr) == 1;
+}
+
+uint64_t hostname_mismatch_code() {
+  return static_cast<uint64_t>(X509_V_ERR_HOSTNAME_MISMATCH);
+}
+
+long get_verify_result(const_session_t session) {
+  if (!session) return X509_V_ERR_UNSPECIFIED;
+  return SSL_get_verify_result(static_cast<SSL *>(const_cast<void *>(session)));
+}
+
+std::string get_cert_subject_cn(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<X509 *>(cert);
+  auto subject_name = X509_get_subject_name(x509);
+  if (!subject_name) return "";
+
+  char buf[256];
+  auto len =
+      X509_NAME_get_text_by_NID(subject_name, NID_commonName, buf, sizeof(buf));
+  if (len < 0) return "";
+  return std::string(buf, static_cast<size_t>(len));
+}
+
+std::string get_cert_issuer_name(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<X509 *>(cert);
+  auto issuer_name = X509_get_issuer_name(x509);
+  if (!issuer_name) return "";
+
+  char buf[256];
+  X509_NAME_oneline(issuer_name, buf, sizeof(buf));
+  return std::string(buf);
+}
+
+bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
+  sans.clear();
+  if (!cert) return false;
+  auto x509 = static_cast<X509 *>(cert);
+
+  auto names = static_cast<GENERAL_NAMES *>(
+      X509_get_ext_d2i(x509, NID_subject_alt_name, nullptr, nullptr));
+  if (!names) return true; // No SANs is valid
+
+  auto count = sk_GENERAL_NAME_num(names);
+  for (int i = 0; i < count; i++) {
+    auto gen = sk_GENERAL_NAME_value(names, i);
+    if (!gen) continue;
+
+    SanEntry entry;
+    switch (gen->type) {
+    case GEN_DNS:
+      entry.type = SanType::DNS;
+      if (gen->d.dNSName) {
+        entry.value = std::string(
+            reinterpret_cast<const char *>(
+                ASN1_STRING_get0_data(gen->d.dNSName)),
+            static_cast<size_t>(ASN1_STRING_length(gen->d.dNSName)));
+      }
+      break;
+    case GEN_IPADD:
+      entry.type = SanType::IP;
+      if (gen->d.iPAddress) {
+        auto data = ASN1_STRING_get0_data(gen->d.iPAddress);
+        auto len = ASN1_STRING_length(gen->d.iPAddress);
+        if (len == 4) {
+          // IPv4
+          char buf[INET_ADDRSTRLEN];
+          inet_ntop(AF_INET, data, buf, sizeof(buf));
+          entry.value = buf;
+        } else if (len == 16) {
+          // IPv6
+          char buf[INET6_ADDRSTRLEN];
+          inet_ntop(AF_INET6, data, buf, sizeof(buf));
+          entry.value = buf;
+        }
+      }
+      break;
+    case GEN_EMAIL:
+      entry.type = SanType::EMAIL;
+      if (gen->d.rfc822Name) {
+        entry.value = std::string(
+            reinterpret_cast<const char *>(
+                ASN1_STRING_get0_data(gen->d.rfc822Name)),
+            static_cast<size_t>(ASN1_STRING_length(gen->d.rfc822Name)));
+      }
+      break;
+    case GEN_URI:
+      entry.type = SanType::URI;
+      if (gen->d.uniformResourceIdentifier) {
+        entry.value = std::string(
+            reinterpret_cast<const char *>(
+                ASN1_STRING_get0_data(gen->d.uniformResourceIdentifier)),
+            static_cast<size_t>(
+                ASN1_STRING_length(gen->d.uniformResourceIdentifier)));
+      }
+      break;
+    default: entry.type = SanType::OTHER; break;
+    }
+
+    if (!entry.value.empty()) { sans.push_back(std::move(entry)); }
+  }
+
+  GENERAL_NAMES_free(names);
+  return true;
+}
+
+bool get_cert_validity(cert_t cert, time_t &not_before,
+                              time_t &not_after) {
+  if (!cert) return false;
+  auto x509 = static_cast<X509 *>(cert);
+
+  auto nb = X509_get0_notBefore(x509);
+  auto na = X509_get0_notAfter(x509);
+  if (!nb || !na) return false;
+
+  ASN1_TIME *epoch = ASN1_TIME_new();
+  if (!epoch) return false;
+  auto se = detail::scope_exit([&] { ASN1_TIME_free(epoch); });
+
+  if (!ASN1_TIME_set(epoch, 0)) return false;
+
+  int pday, psec;
+
+  if (!ASN1_TIME_diff(&pday, &psec, epoch, nb)) return false;
+  not_before = 86400 * (time_t)pday + psec;
+
+  if (!ASN1_TIME_diff(&pday, &psec, epoch, na)) return false;
+  not_after = 86400 * (time_t)pday + psec;
+
+  return true;
+}
+
+std::string get_cert_serial(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<X509 *>(cert);
+
+  auto serial = X509_get_serialNumber(x509);
+  if (!serial) return "";
+
+  auto bn = ASN1_INTEGER_to_BN(serial, nullptr);
+  if (!bn) return "";
+
+  auto hex = BN_bn2hex(bn);
+  BN_free(bn);
+  if (!hex) return "";
+
+  std::string result(hex);
+  OPENSSL_free(hex);
+  return result;
+}
+
+bool get_cert_der(cert_t cert, std::vector<unsigned char> &der) {
+  if (!cert) return false;
+  auto x509 = static_cast<X509 *>(cert);
+  auto len = i2d_X509(x509, nullptr);
+  if (len < 0) return false;
+  der.resize(static_cast<size_t>(len));
+  auto p = der.data();
+  i2d_X509(x509, &p);
+  return true;
+}
+
+const char *get_sni(const_session_t session) {
+  if (!session) return nullptr;
+  auto ssl = static_cast<SSL *>(const_cast<void *>(session));
+  return SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name);
+}
+
+uint64_t peek_error() { return ERR_peek_last_error(); }
+
+uint64_t get_error() { return ERR_get_error(); }
+
+std::string error_string(uint64_t code) {
+  char buf[256];
+  ERR_error_string_n(static_cast<unsigned long>(code), buf, sizeof(buf));
+  return std::string(buf);
+}
+
+ca_store_t create_ca_store(const char *pem, size_t len) {
+  auto mem = BIO_new_mem_buf(pem, static_cast<int>(len));
+  if (!mem) { return nullptr; }
+  auto mem_guard = detail::scope_exit([&] { BIO_free_all(mem); });
+
+  auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr);
+  if (!inf) { return nullptr; }
+
+  auto store = X509_STORE_new();
+  if (store) {
+    for (auto i = 0; i < static_cast<int>(sk_X509_INFO_num(inf)); i++) {
+      auto itmp = sk_X509_INFO_value(inf, i);
+      if (!itmp) { continue; }
+      if (itmp->x509) { X509_STORE_add_cert(store, itmp->x509); }
+      if (itmp->crl) { X509_STORE_add_crl(store, itmp->crl); }
+    }
+  }
+
+  sk_X509_INFO_pop_free(inf, X509_INFO_free);
+  return static_cast<ca_store_t>(store);
+}
+
+void free_ca_store(ca_store_t store) {
+  if (store) { X509_STORE_free(static_cast<X509_STORE *>(store)); }
+}
+
+bool set_ca_store(ctx_t ctx, ca_store_t store) {
+  if (!ctx || !store) { return false; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+  auto x509_store = static_cast<X509_STORE *>(store);
+
+  // Check if same store is already set
+  if (SSL_CTX_get_cert_store(ssl_ctx) == x509_store) { return true; }
+
+  // SSL_CTX_set_cert_store takes ownership and frees the old store
+  SSL_CTX_set_cert_store(ssl_ctx, x509_store);
+  return true;
+}
+
+size_t get_ca_certs(ctx_t ctx, std::vector<cert_t> &certs) {
+  certs.clear();
+  if (!ctx) { return 0; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  auto store = SSL_CTX_get_cert_store(ssl_ctx);
+  if (!store) { return 0; }
+
+  auto objs = X509_STORE_get0_objects(store);
+  if (!objs) { return 0; }
+
+  auto count = sk_X509_OBJECT_num(objs);
+  for (decltype(count) i = 0; i < count; i++) {
+    auto obj = sk_X509_OBJECT_value(objs, i);
+    if (!obj) { continue; }
+    if (X509_OBJECT_get_type(obj) == X509_LU_X509) {
+      auto x509 = X509_OBJECT_get0_X509(obj);
+      if (x509) {
+        // Increment reference count so caller can free it
+        X509_up_ref(x509);
+        certs.push_back(static_cast<cert_t>(x509));
+      }
+    }
+  }
+  return certs.size();
+}
+
+std::vector<std::string> get_ca_names(ctx_t ctx) {
+  std::vector<std::string> names;
+  if (!ctx) { return names; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  auto store = SSL_CTX_get_cert_store(ssl_ctx);
+  if (!store) { return names; }
+
+  auto objs = X509_STORE_get0_objects(store);
+  if (!objs) { return names; }
+
+  auto count = sk_X509_OBJECT_num(objs);
+  for (decltype(count) i = 0; i < count; i++) {
+    auto obj = sk_X509_OBJECT_value(objs, i);
+    if (!obj) { continue; }
+    if (X509_OBJECT_get_type(obj) == X509_LU_X509) {
+      auto x509 = X509_OBJECT_get0_X509(obj);
+      if (x509) {
+        auto subject = X509_get_subject_name(x509);
+        if (subject) {
+          char buf[512];
+          X509_NAME_oneline(subject, buf, sizeof(buf));
+          names.push_back(buf);
+        }
+      }
+    }
+  }
+  return names;
+}
+
+bool update_server_cert(ctx_t ctx, const char *cert_pem,
+                               const char *key_pem, const char *password) {
+  if (!ctx || !cert_pem || !key_pem) { return false; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  // Load certificate from PEM
+  auto cert_bio = BIO_new_mem_buf(cert_pem, -1);
+  if (!cert_bio) { return false; }
+  auto cert = PEM_read_bio_X509(cert_bio, nullptr, nullptr, nullptr);
+  BIO_free(cert_bio);
+  if (!cert) { return false; }
+
+  // Load private key from PEM
+  auto key_bio = BIO_new_mem_buf(key_pem, -1);
+  if (!key_bio) {
+    X509_free(cert);
+    return false;
+  }
+  auto key = PEM_read_bio_PrivateKey(key_bio, nullptr, nullptr,
+                                     password ? const_cast<char *>(password)
+                                              : nullptr);
+  BIO_free(key_bio);
+  if (!key) {
+    X509_free(cert);
+    return false;
+  }
+
+  // Update certificate and key
+  auto ret = SSL_CTX_use_certificate(ssl_ctx, cert) == 1 &&
+             SSL_CTX_use_PrivateKey(ssl_ctx, key) == 1;
+
+  X509_free(cert);
+  EVP_PKEY_free(key);
+  return ret;
+}
+
+bool update_server_client_ca(ctx_t ctx, const char *ca_pem) {
+  if (!ctx || !ca_pem) { return false; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  // Create new X509_STORE from PEM
+  auto store = create_ca_store(ca_pem, strlen(ca_pem));
+  if (!store) { return false; }
+
+  // SSL_CTX_set_cert_store takes ownership
+  SSL_CTX_set_cert_store(ssl_ctx, static_cast<X509_STORE *>(store));
+
+  // Set client CA list for client certificate request
+  auto ca_list = impl::create_client_ca_list_from_pem(ca_pem);
+  if (ca_list) {
+    // SSL_CTX_set_client_CA_list takes ownership of ca_list
+    SSL_CTX_set_client_CA_list(ssl_ctx, ca_list);
+  }
+
+  return true;
+}
+
+bool set_verify_callback(ctx_t ctx, VerifyCallback callback) {
+  if (!ctx) { return false; }
+  auto ssl_ctx = static_cast<SSL_CTX *>(ctx);
+
+  impl::get_verify_callback() = std::move(callback);
+
+  if (impl::get_verify_callback()) {
+    SSL_CTX_set_verify(ssl_ctx, SSL_VERIFY_PEER, impl::openssl_verify_callback);
+  } else {
+    SSL_CTX_set_verify(ssl_ctx, SSL_VERIFY_PEER, nullptr);
+  }
+  return true;
+}
+
+long get_verify_error(const_session_t session) {
+  if (!session) { return -1; }
+  auto ssl = static_cast<SSL *>(const_cast<void *>(session));
+  return SSL_get_verify_result(ssl);
+}
+
+std::string verify_error_string(long error_code) {
+  if (error_code == X509_V_OK) { return ""; }
+  const char *str = X509_verify_cert_error_string(static_cast<int>(error_code));
+  return str ? str : "unknown error";
+}
+
+namespace impl {
+
+// OpenSSL-specific helpers for public API wrappers
+ctx_t create_server_context_from_x509(X509 *cert, EVP_PKEY *key,
+                                             X509_STORE *client_ca_store,
+                                             int &out_error) {
+  out_error = 0;
+  auto cert_pem = x509_to_pem(cert);
+  auto key_pem = evp_pkey_to_pem(key);
+  if (cert_pem.empty() || key_pem.empty()) {
+    out_error = static_cast<int>(ERR_get_error());
+    return nullptr;
+  }
+
+  auto ctx = create_server_context();
+  if (!ctx) {
+    out_error = static_cast<int>(get_error());
+    return nullptr;
+  }
+
+  if (!set_server_cert_pem(ctx, cert_pem.c_str(), key_pem.c_str(), nullptr)) {
+    out_error = static_cast<int>(get_error());
+    free_context(ctx);
+    return nullptr;
+  }
+
+  if (client_ca_store) {
+    // Set cert store for verification (SSL_CTX_set_cert_store takes ownership)
+    SSL_CTX_set_cert_store(static_cast<SSL_CTX *>(ctx), client_ca_store);
+
+    // Extract and set client CA list directly from store (more efficient than
+    // PEM conversion)
+    auto ca_list = extract_client_ca_list_from_store(client_ca_store);
+    if (ca_list) {
+      SSL_CTX_set_client_CA_list(static_cast<SSL_CTX *>(ctx), ca_list);
+    }
+
+    set_verify_client(ctx, true);
+  }
+
+  return ctx;
+}
+
+void update_server_certs_from_x509(ctx_t ctx, X509 *cert, EVP_PKEY *key,
+                                          X509_STORE *client_ca_store) {
+  auto cert_pem = x509_to_pem(cert);
+  auto key_pem = evp_pkey_to_pem(key);
+
+  if (!cert_pem.empty() && !key_pem.empty()) {
+    update_server_cert(ctx, cert_pem.c_str(), key_pem.c_str(), nullptr);
+  }
+
+  if (client_ca_store) {
+    auto ca_pem = x509_store_to_pem(client_ca_store);
+    if (!ca_pem.empty()) { update_server_client_ca(ctx, ca_pem.c_str()); }
+    X509_STORE_free(client_ca_store);
+  }
+}
+
+ctx_t create_client_context_from_x509(X509 *cert, EVP_PKEY *key,
+                                             const char *password,
+                                             unsigned long &out_error) {
+  out_error = 0;
+  auto ctx = create_client_context();
+  if (!ctx) {
+    out_error = static_cast<unsigned long>(get_error());
+    return nullptr;
+  }
+
+  if (cert && key) {
+    auto cert_pem = x509_to_pem(cert);
+    auto key_pem = evp_pkey_to_pem(key);
+    if (cert_pem.empty() || key_pem.empty()) {
+      out_error = ERR_get_error();
+      free_context(ctx);
+      return nullptr;
+    }
+    if (!set_client_cert_pem(ctx, cert_pem.c_str(), key_pem.c_str(),
+                             password)) {
+      out_error = static_cast<unsigned long>(get_error());
+      free_context(ctx);
+      return nullptr;
+    }
+  }
+
+  return ctx;
+}
+
+} // namespace impl
+
+} // namespace tls
+
+// ClientImpl::set_ca_cert_store - defined here to use
+// tls::impl::x509_store_to_pem Deprecated: converts X509_STORE to PEM and
+// stores for redirect transfer
+void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
+  if (ca_cert_store) {
+    ca_cert_pem_ = tls::impl::x509_store_to_pem(ca_cert_store);
+  }
+}
+
+SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
+                            X509_STORE *client_ca_cert_store) {
+  ctx_ = tls::impl::create_server_context_from_x509(
+      cert, private_key, client_ca_cert_store, last_ssl_error_);
+}
+
+SSLServer::SSLServer(
+    const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback) {
+  // Use abstract API to create context
+  ctx_ = tls::create_server_context();
+  if (ctx_) {
+    // Pass to OpenSSL-specific callback (ctx_ is SSL_CTX* internally)
+    auto ssl_ctx = static_cast<SSL_CTX *>(ctx_);
+    if (!setup_ssl_ctx_callback(*ssl_ctx)) {
+      tls::free_context(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+}
+
+SSL_CTX *SSLServer::ssl_context() const {
+  return static_cast<SSL_CTX *>(ctx_);
+}
+
+void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key,
+                                    X509_STORE *client_ca_cert_store) {
+  std::lock_guard<std::mutex> guard(ctx_mutex_);
+  tls::impl::update_server_certs_from_x509(ctx_, cert, private_key,
+                                           client_ca_cert_store);
+}
+
+SSLClient::SSLClient(const std::string &host, int port,
+                            X509 *client_cert, EVP_PKEY *client_key,
+                            const std::string &private_key_password)
+    : ClientImpl(host, port) {
+  const char *password =
+      private_key_password.empty() ? nullptr : private_key_password.c_str();
+  ctx_ = tls::impl::create_client_context_from_x509(
+      client_cert, client_key, password, last_backend_error_);
+}
+
+long SSLClient::get_verify_result() const { return verify_result_; }
+
+void SSLClient::set_server_certificate_verifier(
+    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
+  // Wrap SSL* callback into backend-independent session_verifier_
+  auto v = std::make_shared<std::function<SSLVerifierResponse(SSL *)>>(
+      std::move(verifier));
+  session_verifier_ = [v](tls::session_t session) {
+    return (*v)(static_cast<SSL *>(session));
+  };
+}
+
+SSL_CTX *SSLClient::ssl_context() const {
+  return static_cast<SSL_CTX *>(ctx_);
+}
+
+bool SSLClient::verify_host(X509 *server_cert) const {
+  /* Quote from RFC2818 section 3.1 "Server Identity"
+
+     If a subjectAltName extension of type dNSName is present, that MUST
+     be used as the identity. Otherwise, the (most specific) Common Name
+     field in the Subject field of the certificate MUST be used. Although
+     the use of the Common Name is existing practice, it is deprecated and
+     Certification Authorities are encouraged to use the dNSName instead.
+
+     Matching is performed using the matching rules specified by
+     [RFC2459].  If more than one identity of a given type is present in
+     the certificate (e.g., more than one dNSName name, a match in any one
+     of the set is considered acceptable.) Names may contain the wildcard
+     character * which is considered to match any single domain name
+     component or component fragment. E.g., *.a.com matches foo.a.com but
+     not bar.foo.a.com. f*.com matches foo.com but not bar.com.
+
+     In some cases, the URI is specified as an IP address rather than a
+     hostname. In this case, the iPAddress subjectAltName must be present
+     in the certificate and must exactly match the IP in the URI.
+
+  */
+  return verify_host_with_subject_alt_name(server_cert) ||
+         verify_host_with_common_name(server_cert);
+}
+
+bool
+SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
+  auto ret = false;
+
+  auto type = GEN_DNS;
+
+  struct in6_addr addr6 = {};
+  struct in_addr addr = {};
+  size_t addr_len = 0;
+
+#ifndef __MINGW32__
+  if (inet_pton(AF_INET6, host_.c_str(), &addr6)) {
+    type = GEN_IPADD;
+    addr_len = sizeof(struct in6_addr);
+  } else if (inet_pton(AF_INET, host_.c_str(), &addr)) {
+    type = GEN_IPADD;
+    addr_len = sizeof(struct in_addr);
+  }
+#endif
+
+  auto alt_names = static_cast<const struct stack_st_GENERAL_NAME *>(
+      X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr));
+
+  if (alt_names) {
+    auto dsn_matched = false;
+    auto ip_matched = false;
+
+    auto count = sk_GENERAL_NAME_num(alt_names);
+
+    for (decltype(count) i = 0; i < count && !dsn_matched; i++) {
+      auto val = sk_GENERAL_NAME_value(alt_names, i);
+      if (!val || val->type != type) { continue; }
+
+      auto name =
+          reinterpret_cast<const char *>(ASN1_STRING_get0_data(val->d.ia5));
+      if (name == nullptr) { continue; }
+
+      auto name_len = static_cast<size_t>(ASN1_STRING_length(val->d.ia5));
+
+      switch (type) {
+      case GEN_DNS:
+        dsn_matched =
+            detail::match_hostname(std::string(name, name_len), host_);
+        break;
+
+      case GEN_IPADD:
+        if (!memcmp(&addr6, name, addr_len) || !memcmp(&addr, name, addr_len)) {
+          ip_matched = true;
+        }
+        break;
+      }
+    }
+
+    if (dsn_matched || ip_matched) { ret = true; }
+  }
+
+  GENERAL_NAMES_free(const_cast<STACK_OF(GENERAL_NAME) *>(
+      reinterpret_cast<const STACK_OF(GENERAL_NAME) *>(alt_names)));
+  return ret;
+}
+
+bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
+  const auto subject_name = X509_get_subject_name(server_cert);
+
+  if (subject_name != nullptr) {
+    char name[BUFSIZ];
+    auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName,
+                                              name, sizeof(name));
+
+    if (name_len != -1) {
+      return detail::match_hostname(
+          std::string(name, static_cast<size_t>(name_len)), host_);
+    }
+  }
+
+  return false;
+}
+
+#endif // CPPHTTPLIB_OPENSSL_SUPPORT
+
+/*
+ * Group 9: TLS abstraction layer - Mbed TLS backend
+ */
+
+/*
+ * Mbed TLS Backend Implementation
+ */
+
+#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT
+namespace tls {
+
+namespace impl {
+
+// Mbed TLS session wrapper
+struct MbedTlsSession {
+  mbedtls_ssl_context ssl;
+  socket_t sock = INVALID_SOCKET;
+  std::string hostname;     // For client: set via set_sni
+  std::string sni_hostname; // For server: received from client via SNI callback
+
+  MbedTlsSession() { mbedtls_ssl_init(&ssl); }
+
+  ~MbedTlsSession() { mbedtls_ssl_free(&ssl); }
+
+  MbedTlsSession(const MbedTlsSession &) = delete;
+  MbedTlsSession &operator=(const MbedTlsSession &) = delete;
+};
+
+// Thread-local error code accessor for Mbed TLS (since it doesn't have an error
+// queue)
+int &mbedtls_last_error() {
+  static thread_local int err = 0;
+  return err;
+}
+
+// Helper to map Mbed TLS error to ErrorCode
+ErrorCode map_mbedtls_error(int ret, int &out_errno) {
+  if (ret == 0) { return ErrorCode::Success; }
+  if (ret == MBEDTLS_ERR_SSL_WANT_READ) { return ErrorCode::WantRead; }
+  if (ret == MBEDTLS_ERR_SSL_WANT_WRITE) { return ErrorCode::WantWrite; }
+  if (ret == MBEDTLS_ERR_SSL_PEER_CLOSE_NOTIFY) {
+    return ErrorCode::PeerClosed;
+  }
+  if (ret == MBEDTLS_ERR_NET_CONN_RESET || ret == MBEDTLS_ERR_NET_SEND_FAILED ||
+      ret == MBEDTLS_ERR_NET_RECV_FAILED) {
+    out_errno = errno;
+    return ErrorCode::SyscallError;
+  }
+  if (ret == MBEDTLS_ERR_X509_CERT_VERIFY_FAILED) {
+    return ErrorCode::CertVerifyFailed;
+  }
+  return ErrorCode::Fatal;
+}
+
+// BIO-like send callback for Mbed TLS
+int mbedtls_net_send_cb(void *ctx, const unsigned char *buf,
+                               size_t len) {
+  auto sock = *static_cast<socket_t *>(ctx);
+#ifdef _WIN32
+  auto ret =
+      send(sock, reinterpret_cast<const char *>(buf), static_cast<int>(len), 0);
+  if (ret == SOCKET_ERROR) {
+    int err = WSAGetLastError();
+    if (err == WSAEWOULDBLOCK) { return MBEDTLS_ERR_SSL_WANT_WRITE; }
+    return MBEDTLS_ERR_NET_SEND_FAILED;
+  }
+#else
+  auto ret = send(sock, buf, len, 0);
+  if (ret < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      return MBEDTLS_ERR_SSL_WANT_WRITE;
+    }
+    return MBEDTLS_ERR_NET_SEND_FAILED;
+  }
+#endif
+  return static_cast<int>(ret);
+}
+
+// BIO-like recv callback for Mbed TLS
+int mbedtls_net_recv_cb(void *ctx, unsigned char *buf, size_t len) {
+  auto sock = *static_cast<socket_t *>(ctx);
+#ifdef _WIN32
+  auto ret =
+      recv(sock, reinterpret_cast<char *>(buf), static_cast<int>(len), 0);
+  if (ret == SOCKET_ERROR) {
+    int err = WSAGetLastError();
+    if (err == WSAEWOULDBLOCK) { return MBEDTLS_ERR_SSL_WANT_READ; }
+    return MBEDTLS_ERR_NET_RECV_FAILED;
+  }
+#else
+  auto ret = recv(sock, buf, len, 0);
+  if (ret < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      return MBEDTLS_ERR_SSL_WANT_READ;
+    }
+    return MBEDTLS_ERR_NET_RECV_FAILED;
+  }
+#endif
+  if (ret == 0) { return MBEDTLS_ERR_SSL_PEER_CLOSE_NOTIFY; }
+  return static_cast<int>(ret);
+}
+
+// MbedTlsContext constructor/destructor implementations
+MbedTlsContext::MbedTlsContext() {
+  mbedtls_ssl_config_init(&conf);
+  mbedtls_entropy_init(&entropy);
+  mbedtls_ctr_drbg_init(&ctr_drbg);
+  mbedtls_x509_crt_init(&ca_chain);
+  mbedtls_x509_crt_init(&own_cert);
+  mbedtls_pk_init(&own_key);
+}
+
+MbedTlsContext::~MbedTlsContext() {
+  mbedtls_pk_free(&own_key);
+  mbedtls_x509_crt_free(&own_cert);
+  mbedtls_x509_crt_free(&ca_chain);
+  mbedtls_ctr_drbg_free(&ctr_drbg);
+  mbedtls_entropy_free(&entropy);
+  mbedtls_ssl_config_free(&conf);
+}
+
+// Thread-local storage for SNI captured during handshake
+// This is needed because the SNI callback doesn't have a way to pass
+// session-specific data before the session is fully set up
+std::string &mbedpending_sni() {
+  static thread_local std::string sni;
+  return sni;
+}
+
+// SNI callback for Mbed TLS server to capture client's SNI hostname
+int mbedtls_sni_callback(void *p_ctx, mbedtls_ssl_context *ssl,
+                                const unsigned char *name, size_t name_len) {
+  (void)p_ctx;
+  (void)ssl;
+
+  // Store SNI name in thread-local storage
+  // It will be retrieved and stored in the session after handshake
+  if (name && name_len > 0) {
+    mbedpending_sni().assign(reinterpret_cast<const char *>(name), name_len);
+  } else {
+    mbedpending_sni().clear();
+  }
+  return 0; // Accept any SNI
+}
+
+int mbedtls_verify_callback(void *data, mbedtls_x509_crt *crt,
+                                   int cert_depth, uint32_t *flags);
+
+// Check if a string is an IPv4 address
+bool is_ipv4_address(const std::string &str) {
+  int dots = 0;
+  for (char c : str) {
+    if (c == '.') {
+      dots++;
+    } else if (!isdigit(static_cast<unsigned char>(c))) {
+      return false;
+    }
+  }
+  return dots == 3;
+}
+
+// Parse IPv4 address string to bytes
+bool parse_ipv4(const std::string &str, unsigned char *out) {
+  int parts[4];
+  if (sscanf(str.c_str(), "%d.%d.%d.%d", &parts[0], &parts[1], &parts[2],
+             &parts[3]) != 4) {
+    return false;
+  }
+  for (int i = 0; i < 4; i++) {
+    if (parts[i] < 0 || parts[i] > 255) return false;
+    out[i] = static_cast<unsigned char>(parts[i]);
+  }
+  return true;
+}
+
+// MbedTLS verify callback wrapper
+int mbedtls_verify_callback(void *data, mbedtls_x509_crt *crt,
+                                   int cert_depth, uint32_t *flags) {
+  auto &callback = get_verify_callback();
+  if (!callback) { return 0; } // Continue with default verification
+
+  // data points to the MbedTlsSession
+  auto *session = static_cast<MbedTlsSession *>(data);
+
+  // Build context
+  VerifyContext verify_ctx;
+  verify_ctx.session = static_cast<session_t>(session);
+  verify_ctx.cert = static_cast<cert_t>(crt);
+  verify_ctx.depth = cert_depth;
+  verify_ctx.preverify_ok = (*flags == 0);
+  verify_ctx.error_code = static_cast<long>(*flags);
+
+  // Convert Mbed TLS flags to error string
+  static thread_local char error_buf[256];
+  if (*flags != 0) {
+    mbedtls_x509_crt_verify_info(error_buf, sizeof(error_buf), "", *flags);
+    verify_ctx.error_string = error_buf;
+  } else {
+    verify_ctx.error_string = nullptr;
+  }
+
+  bool accepted = callback(verify_ctx);
+
+  if (accepted) {
+    *flags = 0; // Clear all error flags
+    return 0;
+  }
+  return MBEDTLS_ERR_X509_CERT_VERIFY_FAILED;
+}
+
+} // namespace impl
+
+ctx_t create_client_context() {
+  auto ctx = new (std::nothrow) impl::MbedTlsContext();
+  if (!ctx) { return nullptr; }
+
+  ctx->is_server = false;
+
+  // Seed the random number generator
+  const char *pers = "httplib_client";
+  int ret = mbedtls_ctr_drbg_seed(
+      &ctx->ctr_drbg, mbedtls_entropy_func, &ctx->entropy,
+      reinterpret_cast<const unsigned char *>(pers), strlen(pers));
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    delete ctx;
+    return nullptr;
+  }
+
+  // Set up SSL config for client
+  ret = mbedtls_ssl_config_defaults(&ctx->conf, MBEDTLS_SSL_IS_CLIENT,
+                                    MBEDTLS_SSL_TRANSPORT_STREAM,
+                                    MBEDTLS_SSL_PRESET_DEFAULT);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    delete ctx;
+    return nullptr;
+  }
+
+  // Set random number generator
+  mbedtls_ssl_conf_rng(&ctx->conf, mbedtls_ctr_drbg_random, &ctx->ctr_drbg);
+
+  // Default: verify peer certificate
+  mbedtls_ssl_conf_authmode(&ctx->conf, MBEDTLS_SSL_VERIFY_REQUIRED);
+
+  // Set minimum TLS version to 1.2
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  mbedtls_ssl_conf_min_tls_version(&ctx->conf, MBEDTLS_SSL_VERSION_TLS1_2);
+#else
+  mbedtls_ssl_conf_min_version(&ctx->conf, MBEDTLS_SSL_MAJOR_VERSION_3,
+                               MBEDTLS_SSL_MINOR_VERSION_3);
+#endif
+
+  return static_cast<ctx_t>(ctx);
+}
+
+ctx_t create_server_context() {
+  auto ctx = new (std::nothrow) impl::MbedTlsContext();
+  if (!ctx) { return nullptr; }
+
+  ctx->is_server = true;
+
+  // Seed the random number generator
+  const char *pers = "httplib_server";
+  int ret = mbedtls_ctr_drbg_seed(
+      &ctx->ctr_drbg, mbedtls_entropy_func, &ctx->entropy,
+      reinterpret_cast<const unsigned char *>(pers), strlen(pers));
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    delete ctx;
+    return nullptr;
+  }
+
+  // Set up SSL config for server
+  ret = mbedtls_ssl_config_defaults(&ctx->conf, MBEDTLS_SSL_IS_SERVER,
+                                    MBEDTLS_SSL_TRANSPORT_STREAM,
+                                    MBEDTLS_SSL_PRESET_DEFAULT);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    delete ctx;
+    return nullptr;
+  }
+
+  // Set random number generator
+  mbedtls_ssl_conf_rng(&ctx->conf, mbedtls_ctr_drbg_random, &ctx->ctr_drbg);
+
+  // Default: don't verify client
+  mbedtls_ssl_conf_authmode(&ctx->conf, MBEDTLS_SSL_VERIFY_NONE);
+
+  // Set minimum TLS version to 1.2
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  mbedtls_ssl_conf_min_tls_version(&ctx->conf, MBEDTLS_SSL_VERSION_TLS1_2);
+#else
+  mbedtls_ssl_conf_min_version(&ctx->conf, MBEDTLS_SSL_MAJOR_VERSION_3,
+                               MBEDTLS_SSL_MINOR_VERSION_3);
+#endif
+
+  // Set SNI callback to capture client's SNI hostname
+  mbedtls_ssl_conf_sni(&ctx->conf, impl::mbedtls_sni_callback, nullptr);
+
+  return static_cast<ctx_t>(ctx);
+}
+
+void free_context(ctx_t ctx) {
+  if (ctx) { delete static_cast<impl::MbedTlsContext *>(ctx); }
+}
+
+bool set_min_version(ctx_t ctx, Version version) {
+  if (!ctx) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  // Mbed TLS 3.x uses mbedtls_ssl_protocol_version enum
+  mbedtls_ssl_protocol_version min_ver = MBEDTLS_SSL_VERSION_TLS1_2;
+  if (version >= Version::TLS1_3) {
+#if defined(MBEDTLS_SSL_PROTO_TLS1_3)
+    min_ver = MBEDTLS_SSL_VERSION_TLS1_3;
+#endif
+  }
+  mbedtls_ssl_conf_min_tls_version(&mctx->conf, min_ver);
+#else
+  // Mbed TLS 2.x uses major/minor version numbers
+  int major = MBEDTLS_SSL_MAJOR_VERSION_3;
+  int minor = MBEDTLS_SSL_MINOR_VERSION_3; // TLS 1.2
+  if (version >= Version::TLS1_3) {
+#if defined(MBEDTLS_SSL_PROTO_TLS1_3)
+    minor = MBEDTLS_SSL_MINOR_VERSION_4; // TLS 1.3
+#else
+    minor = MBEDTLS_SSL_MINOR_VERSION_3; // Fall back to TLS 1.2
+#endif
+  }
+  mbedtls_ssl_conf_min_version(&mctx->conf, major, minor);
+#endif
+  return true;
+}
+
+bool load_ca_pem(ctx_t ctx, const char *pem, size_t len) {
+  if (!ctx || !pem) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // mbedtls_x509_crt_parse expects null-terminated string for PEM
+  // Add null terminator if not present
+  std::string pem_str(pem, len);
+  int ret = mbedtls_x509_crt_parse(
+      &mctx->ca_chain, reinterpret_cast<const unsigned char *>(pem_str.c_str()),
+      pem_str.size() + 1);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  mbedtls_ssl_conf_ca_chain(&mctx->conf, &mctx->ca_chain, nullptr);
+  return true;
+}
+
+bool load_ca_file(ctx_t ctx, const char *file_path) {
+  if (!ctx || !file_path) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  int ret = mbedtls_x509_crt_parse_file(&mctx->ca_chain, file_path);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  mbedtls_ssl_conf_ca_chain(&mctx->conf, &mctx->ca_chain, nullptr);
+  return true;
+}
+
+bool load_ca_dir(ctx_t ctx, const char *dir_path) {
+  if (!ctx || !dir_path) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  int ret = mbedtls_x509_crt_parse_path(&mctx->ca_chain, dir_path);
+  if (ret < 0) { // Returns number of certs on success, negative on error
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  mbedtls_ssl_conf_ca_chain(&mctx->conf, &mctx->ca_chain, nullptr);
+  return true;
+}
+
+bool load_system_certs(ctx_t ctx) {
+  if (!ctx) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+  bool loaded = false;
+
+#ifdef _WIN32
+  // Load from Windows certificate store (ROOT and CA)
+  static const wchar_t *store_names[] = {L"ROOT", L"CA"};
+  for (auto store_name : store_names) {
+    HCERTSTORE hStore = CertOpenSystemStoreW(0, store_name);
+    if (hStore) {
+      PCCERT_CONTEXT pContext = nullptr;
+      while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) !=
+             nullptr) {
+        int ret = mbedtls_x509_crt_parse_der(
+            &mctx->ca_chain, pContext->pbCertEncoded, pContext->cbCertEncoded);
+        if (ret == 0) { loaded = true; }
+      }
+      CertCloseStore(hStore, 0);
+    }
+  }
+#elif defined(__APPLE__) && defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
+  // Load from macOS Keychain
+  CFArrayRef certs = nullptr;
+  OSStatus status = SecTrustCopyAnchorCertificates(&certs);
+  if (status == errSecSuccess && certs) {
+    CFIndex count = CFArrayGetCount(certs);
+    for (CFIndex i = 0; i < count; i++) {
+      SecCertificateRef cert =
+          (SecCertificateRef)CFArrayGetValueAtIndex(certs, i);
+      CFDataRef data = SecCertificateCopyData(cert);
+      if (data) {
+        int ret = mbedtls_x509_crt_parse_der(
+            &mctx->ca_chain, CFDataGetBytePtr(data),
+            static_cast<size_t>(CFDataGetLength(data)));
+        if (ret == 0) { loaded = true; }
+        CFRelease(data);
+      }
+    }
+    CFRelease(certs);
+  }
+#else
+  // Try common CA certificate locations on Linux/Unix
+  static const char *ca_paths[] = {
+      "/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu
+      "/etc/pki/tls/certs/ca-bundle.crt",   // RHEL/CentOS
+      "/etc/ssl/ca-bundle.pem",             // OpenSUSE
+      "/etc/pki/tls/cacert.pem",            // OpenELEC
+      "/etc/ssl/cert.pem",                  // Alpine, FreeBSD
+      nullptr};
+
+  for (const char **path = ca_paths; *path; ++path) {
+    int ret = mbedtls_x509_crt_parse_file(&mctx->ca_chain, *path);
+    if (ret >= 0) {
+      loaded = true;
+      break;
+    }
+  }
+
+  // Also try the CA directory
+  if (!loaded) {
+    static const char *ca_dirs[] = {"/etc/ssl/certs",     // Debian/Ubuntu
+                                    "/etc/pki/tls/certs", // RHEL/CentOS
+                                    "/usr/share/ca-certificates", nullptr};
+
+    for (const char **dir = ca_dirs; *dir; ++dir) {
+      int ret = mbedtls_x509_crt_parse_path(&mctx->ca_chain, *dir);
+      if (ret >= 0) {
+        loaded = true;
+        break;
+      }
+    }
+  }
+#endif
+
+  if (loaded) {
+    mbedtls_ssl_conf_ca_chain(&mctx->conf, &mctx->ca_chain, nullptr);
+  }
+  return loaded;
+}
+
+bool set_client_cert_pem(ctx_t ctx, const char *cert, const char *key,
+                                const char *password) {
+  if (!ctx || !cert || !key) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Parse certificate
+  std::string cert_str(cert);
+  int ret = mbedtls_x509_crt_parse(
+      &mctx->own_cert,
+      reinterpret_cast<const unsigned char *>(cert_str.c_str()),
+      cert_str.size() + 1);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  // Parse private key
+  std::string key_str(key);
+  const unsigned char *pwd =
+      password ? reinterpret_cast<const unsigned char *>(password) : nullptr;
+  size_t pwd_len = password ? strlen(password) : 0;
+
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  ret = mbedtls_pk_parse_key(
+      &mctx->own_key, reinterpret_cast<const unsigned char *>(key_str.c_str()),
+      key_str.size() + 1, pwd, pwd_len, mbedtls_ctr_drbg_random,
+      &mctx->ctr_drbg);
+#else
+  ret = mbedtls_pk_parse_key(
+      &mctx->own_key, reinterpret_cast<const unsigned char *>(key_str.c_str()),
+      key_str.size() + 1, pwd, pwd_len);
+#endif
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  ret = mbedtls_ssl_conf_own_cert(&mctx->conf, &mctx->own_cert, &mctx->own_key);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool set_client_cert_file(ctx_t ctx, const char *cert_path,
+                                 const char *key_path, const char *password) {
+  if (!ctx || !cert_path || !key_path) { return false; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Parse certificate file
+  int ret = mbedtls_x509_crt_parse_file(&mctx->own_cert, cert_path);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  // Parse private key file
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  ret = mbedtls_pk_parse_keyfile(&mctx->own_key, key_path, password,
+                                 mbedtls_ctr_drbg_random, &mctx->ctr_drbg);
+#else
+  ret = mbedtls_pk_parse_keyfile(&mctx->own_key, key_path, password);
+#endif
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  ret = mbedtls_ssl_conf_own_cert(&mctx->conf, &mctx->own_cert, &mctx->own_key);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  return true;
+}
+
+void set_verify_client(ctx_t ctx, bool require) {
+  if (!ctx) { return; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+  mctx->verify_client = require;
+  if (require) {
+    mbedtls_ssl_conf_authmode(&mctx->conf, MBEDTLS_SSL_VERIFY_REQUIRED);
+  } else {
+    // If a verify callback is set, use OPTIONAL mode to ensure the callback
+    // is called (matching OpenSSL behavior). Otherwise use NONE.
+    mbedtls_ssl_conf_authmode(&mctx->conf, mctx->has_verify_callback
+                                               ? MBEDTLS_SSL_VERIFY_OPTIONAL
+                                               : MBEDTLS_SSL_VERIFY_NONE);
+  }
+}
+
+session_t create_session(ctx_t ctx, socket_t sock) {
+  if (!ctx || sock == INVALID_SOCKET) { return nullptr; }
+  auto mctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  auto session = new (std::nothrow) impl::MbedTlsSession();
+  if (!session) { return nullptr; }
+
+  session->sock = sock;
+
+  int ret = mbedtls_ssl_setup(&session->ssl, &mctx->conf);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    delete session;
+    return nullptr;
+  }
+
+  // Set BIO callbacks
+  mbedtls_ssl_set_bio(&session->ssl, &session->sock, impl::mbedtls_net_send_cb,
+                      impl::mbedtls_net_recv_cb, nullptr);
+
+  // Set per-session verify callback with session pointer if callback is
+  // registered
+  if (mctx->has_verify_callback) {
+    mbedtls_ssl_set_verify(&session->ssl, impl::mbedtls_verify_callback,
+                           session);
+  }
+
+  return static_cast<session_t>(session);
+}
+
+void free_session(session_t session) {
+  if (session) { delete static_cast<impl::MbedTlsSession *>(session); }
+}
+
+bool set_sni(session_t session, const char *hostname) {
+  if (!session || !hostname) { return false; }
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+
+  int ret = mbedtls_ssl_set_hostname(&msession->ssl, hostname);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  msession->hostname = hostname;
+  return true;
+}
+
+bool set_hostname(session_t session, const char *hostname) {
+  // In Mbed TLS, set_hostname also sets up hostname verification
+  return set_sni(session, hostname);
+}
+
+TlsError connect(session_t session) {
+  TlsError err;
+  if (!session) {
+    err.code = ErrorCode::Fatal;
+    return err;
+  }
+
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+  int ret = mbedtls_ssl_handshake(&msession->ssl);
+
+  if (ret == 0) {
+    err.code = ErrorCode::Success;
+  } else {
+    err.code = impl::map_mbedtls_error(ret, err.sys_errno);
+    err.backend_code = static_cast<uint64_t>(-ret);
+    impl::mbedtls_last_error() = ret;
+  }
+
+  return err;
+}
+
+TlsError accept(session_t session) {
+  // Same as connect for Mbed TLS - handshake works for both client and server
+  auto result = connect(session);
+
+  // After successful handshake, capture SNI from thread-local storage
+  if (result.code == ErrorCode::Success && session) {
+    auto msession = static_cast<impl::MbedTlsSession *>(session);
+    msession->sni_hostname = std::move(impl::mbedpending_sni());
+    impl::mbedpending_sni().clear();
+  }
+
+  return result;
+}
+
+bool connect_nonblocking(session_t session, socket_t sock,
+                                time_t timeout_sec, time_t timeout_usec,
+                                TlsError *err) {
+  if (!session) {
+    if (err) { err->code = ErrorCode::Fatal; }
+    return false;
+  }
+
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+
+  // Set socket to non-blocking mode
+  detail::set_nonblocking(sock, true);
+  auto cleanup =
+      detail::scope_exit([&]() { detail::set_nonblocking(sock, false); });
+
+  int ret;
+  while ((ret = mbedtls_ssl_handshake(&msession->ssl)) != 0) {
+    if (ret == MBEDTLS_ERR_SSL_WANT_READ) {
+      if (detail::select_read(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+    } else if (ret == MBEDTLS_ERR_SSL_WANT_WRITE) {
+      if (detail::select_write(sock, timeout_sec, timeout_usec) > 0) {
+        continue;
+      }
+    }
+
+    // TlsError or timeout
+    if (err) {
+      err->code = impl::map_mbedtls_error(ret, err->sys_errno);
+      err->backend_code = static_cast<uint64_t>(-ret);
+    }
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  if (err) { err->code = ErrorCode::Success; }
+  return true;
+}
+
+bool accept_nonblocking(session_t session, socket_t sock,
+                               time_t timeout_sec, time_t timeout_usec,
+                               TlsError *err) {
+  // Same implementation as connect for Mbed TLS
+  bool result =
+      connect_nonblocking(session, sock, timeout_sec, timeout_usec, err);
+
+  // After successful handshake, capture SNI from thread-local storage
+  if (result && session) {
+    auto msession = static_cast<impl::MbedTlsSession *>(session);
+    msession->sni_hostname = std::move(impl::mbedpending_sni());
+    impl::mbedpending_sni().clear();
+  }
+
+  return result;
+}
+
+ssize_t read(session_t session, void *buf, size_t len, TlsError &err) {
+  if (!session || !buf) {
+    err.code = ErrorCode::Fatal;
+    return -1;
+  }
+
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+  int ret =
+      mbedtls_ssl_read(&msession->ssl, static_cast<unsigned char *>(buf), len);
+
+  if (ret > 0) {
+    err.code = ErrorCode::Success;
+    return static_cast<ssize_t>(ret);
+  }
+
+  if (ret == 0) {
+    err.code = ErrorCode::PeerClosed;
+    return 0;
+  }
+
+  err.code = impl::map_mbedtls_error(ret, err.sys_errno);
+  err.backend_code = static_cast<uint64_t>(-ret);
+  impl::mbedtls_last_error() = ret;
+  return -1;
+}
+
+ssize_t write(session_t session, const void *buf, size_t len,
+                     TlsError &err) {
+  if (!session || !buf) {
+    err.code = ErrorCode::Fatal;
+    return -1;
+  }
+
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+  int ret = mbedtls_ssl_write(&msession->ssl,
+                              static_cast<const unsigned char *>(buf), len);
+
+  if (ret > 0) {
+    err.code = ErrorCode::Success;
+    return static_cast<ssize_t>(ret);
+  }
+
+  if (ret == 0) {
+    err.code = ErrorCode::PeerClosed;
+    return 0;
+  }
+
+  err.code = impl::map_mbedtls_error(ret, err.sys_errno);
+  err.backend_code = static_cast<uint64_t>(-ret);
+  impl::mbedtls_last_error() = ret;
+  return -1;
+}
+
+int pending(const_session_t session) {
+  if (!session) { return 0; }
+  auto msession =
+      static_cast<impl::MbedTlsSession *>(const_cast<void *>(session));
+  return static_cast<int>(mbedtls_ssl_get_bytes_avail(&msession->ssl));
+}
+
+void shutdown(session_t session, bool graceful) {
+  if (!session) { return; }
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+
+  if (graceful) {
+    // Try to send close_notify, but don't block forever
+    int ret;
+    int attempts = 0;
+    while ((ret = mbedtls_ssl_close_notify(&msession->ssl)) != 0 &&
+           attempts < 3) {
+      if (ret != MBEDTLS_ERR_SSL_WANT_READ &&
+          ret != MBEDTLS_ERR_SSL_WANT_WRITE) {
+        break;
+      }
+      attempts++;
+    }
+  }
+}
+
+bool is_peer_closed(session_t session, socket_t sock) {
+  if (!session || sock == INVALID_SOCKET) { return true; }
+  auto msession = static_cast<impl::MbedTlsSession *>(session);
+
+  // Check if there's already decrypted data available in the TLS buffer
+  // If so, the connection is definitely alive
+  if (mbedtls_ssl_get_bytes_avail(&msession->ssl) > 0) { return false; }
+
+  // Set socket to non-blocking to avoid blocking on read
+  detail::set_nonblocking(sock, true);
+  auto cleanup =
+      detail::scope_exit([&]() { detail::set_nonblocking(sock, false); });
+
+  // Try a 1-byte read to check connection status
+  // Note: This will consume the byte if data is available, but for the
+  // purpose of checking if peer is closed, this should be acceptable
+  // since we're only called when we expect the connection might be closing
+  unsigned char buf;
+  int ret = mbedtls_ssl_read(&msession->ssl, &buf, 1);
+
+  // If we got data or WANT_READ (would block), connection is alive
+  if (ret > 0 || ret == MBEDTLS_ERR_SSL_WANT_READ) { return false; }
+
+  // If we get a peer close notify or a connection reset, the peer is closed
+  return ret == MBEDTLS_ERR_SSL_PEER_CLOSE_NOTIFY ||
+         ret == MBEDTLS_ERR_NET_CONN_RESET || ret == 0;
+}
+
+cert_t get_peer_cert(const_session_t session) {
+  if (!session) { return nullptr; }
+  auto msession =
+      static_cast<impl::MbedTlsSession *>(const_cast<void *>(session));
+
+  // Mbed TLS returns a pointer to the internal peer cert chain.
+  // WARNING: This pointer is only valid while the session is active.
+  // Do not use the certificate after calling free_session().
+  const mbedtls_x509_crt *cert = mbedtls_ssl_get_peer_cert(&msession->ssl);
+  return const_cast<mbedtls_x509_crt *>(cert);
+}
+
+void free_cert(cert_t cert) {
+  // Mbed TLS: peer certificate is owned by the SSL context.
+  // No-op here, but callers should still call this for cross-backend
+  // portability.
+  (void)cert;
+}
+
+bool verify_hostname(cert_t cert, const char *hostname) {
+  if (!cert || !hostname) { return false; }
+  auto mcert = static_cast<const mbedtls_x509_crt *>(cert);
+  std::string host_str(hostname);
+
+  // Check if hostname is an IP address
+  bool is_ip = impl::is_ipv4_address(host_str);
+  unsigned char ip_bytes[4];
+  if (is_ip) { impl::parse_ipv4(host_str, ip_bytes); }
+
+  // Check Subject Alternative Names (SAN)
+  // In Mbed TLS 3.x, subject_alt_names contains raw values without ASN.1 tags
+  // - DNS names: raw string bytes
+  // - IP addresses: raw IP bytes (4 for IPv4, 16 for IPv6)
+  const mbedtls_x509_sequence *san = &mcert->subject_alt_names;
+  while (san != nullptr && san->buf.p != nullptr && san->buf.len > 0) {
+    const unsigned char *p = san->buf.p;
+    size_t len = san->buf.len;
+
+    if (is_ip) {
+      // Check if this SAN is an IPv4 address (4 bytes)
+      if (len == 4 && memcmp(p, ip_bytes, 4) == 0) { return true; }
+      // Check if this SAN is an IPv6 address (16 bytes) - skip for now
+    } else {
+      // Check if this SAN is a DNS name (printable ASCII string)
+      bool is_dns = len > 0;
+      for (size_t i = 0; i < len && is_dns; i++) {
+        if (p[i] < 32 || p[i] > 126) { is_dns = false; }
+      }
+      if (is_dns) {
+        std::string san_name(reinterpret_cast<const char *>(p), len);
+        if (detail::match_hostname(san_name, host_str)) { return true; }
+      }
+    }
+    san = san->next;
+  }
+
+  // Fallback: Check Common Name (CN) in subject
+  char cn[256];
+  int ret = mbedtls_x509_dn_gets(cn, sizeof(cn), &mcert->subject);
+  if (ret > 0) {
+    std::string cn_str(cn);
+
+    // Look for "CN=" in the DN string
+    size_t cn_pos = cn_str.find("CN=");
+    if (cn_pos != std::string::npos) {
+      size_t start = cn_pos + 3;
+      size_t end = cn_str.find(',', start);
+      std::string cn_value =
+          cn_str.substr(start, end == std::string::npos ? end : end - start);
+
+      if (detail::match_hostname(cn_value, host_str)) { return true; }
+    }
+  }
+
+  return false;
+}
+
+uint64_t hostname_mismatch_code() {
+  return static_cast<uint64_t>(MBEDTLS_X509_BADCERT_CN_MISMATCH);
+}
+
+long get_verify_result(const_session_t session) {
+  if (!session) { return -1; }
+  auto msession =
+      static_cast<impl::MbedTlsSession *>(const_cast<void *>(session));
+  uint32_t flags = mbedtls_ssl_get_verify_result(&msession->ssl);
+  // Return 0 (X509_V_OK equivalent) if verification passed
+  return flags == 0 ? 0 : static_cast<long>(flags);
+}
+
+std::string get_cert_subject_cn(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<mbedtls_x509_crt *>(cert);
+
+  // Find the CN in the subject
+  const mbedtls_x509_name *name = &x509->subject;
+  while (name != nullptr) {
+    if (MBEDTLS_OID_CMP(MBEDTLS_OID_AT_CN, &name->oid) == 0) {
+      return std::string(reinterpret_cast<const char *>(name->val.p),
+                         name->val.len);
+    }
+    name = name->next;
+  }
+  return "";
+}
+
+std::string get_cert_issuer_name(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<mbedtls_x509_crt *>(cert);
+
+  // Build a human-readable issuer name string
+  char buf[512];
+  int ret = mbedtls_x509_dn_gets(buf, sizeof(buf), &x509->issuer);
+  if (ret < 0) return "";
+  return std::string(buf);
+}
+
+bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
+  sans.clear();
+  if (!cert) return false;
+  auto x509 = static_cast<mbedtls_x509_crt *>(cert);
+
+  // Parse the Subject Alternative Name extension
+  const mbedtls_x509_sequence *cur = &x509->subject_alt_names;
+  while (cur != nullptr) {
+    if (cur->buf.len > 0) {
+      // Mbed TLS stores SAN as ASN.1 sequences
+      // The tag byte indicates the type
+      const unsigned char *p = cur->buf.p;
+      size_t len = cur->buf.len;
+
+      // First byte is the tag
+      unsigned char tag = *p;
+      p++;
+      len--;
+
+      // Parse length (simple single-byte length assumed)
+      if (len > 0 && *p < 0x80) {
+        size_t value_len = *p;
+        p++;
+        len--;
+
+        if (value_len <= len) {
+          SanEntry entry;
+          // ASN.1 context tags for GeneralName
+          switch (tag & 0x1F) {
+          case 2: // dNSName
+            entry.type = SanType::DNS;
+            entry.value =
+                std::string(reinterpret_cast<const char *>(p), value_len);
+            break;
+          case 7: // iPAddress
+            entry.type = SanType::IP;
+            if (value_len == 4) {
+              // IPv4
+              char buf[16];
+              snprintf(buf, sizeof(buf), "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
+              entry.value = buf;
+            } else if (value_len == 16) {
+              // IPv6
+              char buf[64];
+              snprintf(buf, sizeof(buf),
+                       "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"
+                       "%02x%02x:%02x%02x:%02x%02x:%02x%02x",
+                       p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8],
+                       p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+              entry.value = buf;
+            }
+            break;
+          case 1: // rfc822Name (email)
+            entry.type = SanType::EMAIL;
+            entry.value =
+                std::string(reinterpret_cast<const char *>(p), value_len);
+            break;
+          case 6: // uniformResourceIdentifier
+            entry.type = SanType::URI;
+            entry.value =
+                std::string(reinterpret_cast<const char *>(p), value_len);
+            break;
+          default: entry.type = SanType::OTHER; break;
+          }
+
+          if (!entry.value.empty()) { sans.push_back(std::move(entry)); }
+        }
+      }
+    }
+    cur = cur->next;
+  }
+  return true;
+}
+
+bool get_cert_validity(cert_t cert, time_t &not_before,
+                              time_t &not_after) {
+  if (!cert) return false;
+  auto x509 = static_cast<mbedtls_x509_crt *>(cert);
+
+  // Convert mbedtls_x509_time to time_t
+  auto to_time_t = [](const mbedtls_x509_time &t) -> time_t {
+    struct tm tm_time = {};
+    tm_time.tm_year = t.year - 1900;
+    tm_time.tm_mon = t.mon - 1;
+    tm_time.tm_mday = t.day;
+    tm_time.tm_hour = t.hour;
+    tm_time.tm_min = t.min;
+    tm_time.tm_sec = t.sec;
+#ifdef _WIN32
+    return _mkgmtime(&tm_time);
+#else
+    return timegm(&tm_time);
+#endif
+  };
+
+  not_before = to_time_t(x509->valid_from);
+  not_after = to_time_t(x509->valid_to);
+  return true;
+}
+
+std::string get_cert_serial(cert_t cert) {
+  if (!cert) return "";
+  auto x509 = static_cast<mbedtls_x509_crt *>(cert);
+
+  // Convert serial number to hex string
+  std::string result;
+  result.reserve(x509->serial.len * 2);
+  for (size_t i = 0; i < x509->serial.len; i++) {
+    char hex[3];
+    snprintf(hex, sizeof(hex), "%02X", x509->serial.p[i]);
+    result += hex;
+  }
+  return result;
+}
+
+bool get_cert_der(cert_t cert, std::vector<unsigned char> &der) {
+  if (!cert) return false;
+  auto crt = static_cast<mbedtls_x509_crt *>(cert);
+  if (!crt->raw.p || crt->raw.len == 0) return false;
+  der.assign(crt->raw.p, crt->raw.p + crt->raw.len);
+  return true;
+}
+
+const char *get_sni(const_session_t session) {
+  if (!session) return nullptr;
+  auto msession = static_cast<const impl::MbedTlsSession *>(session);
+
+  // For server: return SNI received from client during handshake
+  if (!msession->sni_hostname.empty()) {
+    return msession->sni_hostname.c_str();
+  }
+
+  // For client: return the hostname set via set_sni
+  if (!msession->hostname.empty()) { return msession->hostname.c_str(); }
+
+  return nullptr;
+}
+
+uint64_t peek_error() {
+  // Mbed TLS doesn't have an error queue, return the last error
+  return static_cast<uint64_t>(-impl::mbedtls_last_error());
+}
+
+uint64_t get_error() {
+  // Mbed TLS doesn't have an error queue, return and clear the last error
+  uint64_t err = static_cast<uint64_t>(-impl::mbedtls_last_error());
+  impl::mbedtls_last_error() = 0;
+  return err;
+}
+
+std::string error_string(uint64_t code) {
+  char buf[256];
+  mbedtls_strerror(-static_cast<int>(code), buf, sizeof(buf));
+  return std::string(buf);
+}
+
+ca_store_t create_ca_store(const char *pem, size_t len) {
+  auto *ca_chain = new (std::nothrow) mbedtls_x509_crt;
+  if (!ca_chain) { return nullptr; }
+
+  mbedtls_x509_crt_init(ca_chain);
+
+  // mbedtls_x509_crt_parse expects null-terminated PEM
+  int ret = mbedtls_x509_crt_parse(ca_chain,
+                                   reinterpret_cast<const unsigned char *>(pem),
+                                   len + 1); // +1 for null terminator
+  if (ret != 0) {
+    // Try without +1 in case PEM is already null-terminated
+    ret = mbedtls_x509_crt_parse(
+        ca_chain, reinterpret_cast<const unsigned char *>(pem), len);
+    if (ret != 0) {
+      mbedtls_x509_crt_free(ca_chain);
+      delete ca_chain;
+      return nullptr;
+    }
+  }
+
+  return static_cast<ca_store_t>(ca_chain);
+}
+
+void free_ca_store(ca_store_t store) {
+  if (store) {
+    auto *ca_chain = static_cast<mbedtls_x509_crt *>(store);
+    mbedtls_x509_crt_free(ca_chain);
+    delete ca_chain;
+  }
+}
+
+bool set_ca_store(ctx_t ctx, ca_store_t store) {
+  if (!ctx || !store) { return false; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+  auto *ca_chain = static_cast<mbedtls_x509_crt *>(store);
+
+  // Free existing CA chain
+  mbedtls_x509_crt_free(&mbed_ctx->ca_chain);
+  mbedtls_x509_crt_init(&mbed_ctx->ca_chain);
+
+  // Copy the CA chain (deep copy)
+  // Parse from the raw data of the source cert
+  mbedtls_x509_crt *src = ca_chain;
+  while (src != nullptr) {
+    int ret = mbedtls_x509_crt_parse_der(&mbed_ctx->ca_chain, src->raw.p,
+                                         src->raw.len);
+    if (ret != 0) { return false; }
+    src = src->next;
+  }
+
+  // Update the SSL config to use the new CA chain
+  mbedtls_ssl_conf_ca_chain(&mbed_ctx->conf, &mbed_ctx->ca_chain, nullptr);
+  return true;
+}
+
+size_t get_ca_certs(ctx_t ctx, std::vector<cert_t> &certs) {
+  certs.clear();
+  if (!ctx) { return 0; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Iterate through the CA chain
+  mbedtls_x509_crt *cert = &mbed_ctx->ca_chain;
+  while (cert != nullptr && cert->raw.len > 0) {
+    // Create a copy of the certificate for the caller
+    auto *copy = new mbedtls_x509_crt;
+    mbedtls_x509_crt_init(copy);
+    int ret = mbedtls_x509_crt_parse_der(copy, cert->raw.p, cert->raw.len);
+    if (ret == 0) {
+      certs.push_back(static_cast<cert_t>(copy));
+    } else {
+      mbedtls_x509_crt_free(copy);
+      delete copy;
+    }
+    cert = cert->next;
+  }
+  return certs.size();
+}
+
+std::vector<std::string> get_ca_names(ctx_t ctx) {
+  std::vector<std::string> names;
+  if (!ctx) { return names; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Iterate through the CA chain
+  mbedtls_x509_crt *cert = &mbed_ctx->ca_chain;
+  while (cert != nullptr && cert->raw.len > 0) {
+    char buf[512];
+    int ret = mbedtls_x509_dn_gets(buf, sizeof(buf), &cert->subject);
+    if (ret > 0) { names.push_back(buf); }
+    cert = cert->next;
+  }
+  return names;
+}
+
+bool update_server_cert(ctx_t ctx, const char *cert_pem,
+                               const char *key_pem, const char *password) {
+  if (!ctx || !cert_pem || !key_pem) { return false; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Free existing certificate and key
+  mbedtls_x509_crt_free(&mbed_ctx->own_cert);
+  mbedtls_pk_free(&mbed_ctx->own_key);
+  mbedtls_x509_crt_init(&mbed_ctx->own_cert);
+  mbedtls_pk_init(&mbed_ctx->own_key);
+
+  // Parse certificate PEM
+  int ret = mbedtls_x509_crt_parse(
+      &mbed_ctx->own_cert, reinterpret_cast<const unsigned char *>(cert_pem),
+      strlen(cert_pem) + 1);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  // Parse private key PEM
+#ifdef CPPHTTPLIB_MBEDTLS_V3
+  ret = mbedtls_pk_parse_key(
+      &mbed_ctx->own_key, reinterpret_cast<const unsigned char *>(key_pem),
+      strlen(key_pem) + 1,
+      password ? reinterpret_cast<const unsigned char *>(password) : nullptr,
+      password ? strlen(password) : 0, mbedtls_ctr_drbg_random,
+      &mbed_ctx->ctr_drbg);
+#else
+  ret = mbedtls_pk_parse_key(
+      &mbed_ctx->own_key, reinterpret_cast<const unsigned char *>(key_pem),
+      strlen(key_pem) + 1,
+      password ? reinterpret_cast<const unsigned char *>(password) : nullptr,
+      password ? strlen(password) : 0);
+#endif
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  // Configure SSL to use the new certificate and key
+  ret = mbedtls_ssl_conf_own_cert(&mbed_ctx->conf, &mbed_ctx->own_cert,
+                                  &mbed_ctx->own_key);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool update_server_client_ca(ctx_t ctx, const char *ca_pem) {
+  if (!ctx || !ca_pem) { return false; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  // Free existing CA chain
+  mbedtls_x509_crt_free(&mbed_ctx->ca_chain);
+  mbedtls_x509_crt_init(&mbed_ctx->ca_chain);
+
+  // Parse CA PEM
+  int ret = mbedtls_x509_crt_parse(
+      &mbed_ctx->ca_chain, reinterpret_cast<const unsigned char *>(ca_pem),
+      strlen(ca_pem) + 1);
+  if (ret != 0) {
+    impl::mbedtls_last_error() = ret;
+    return false;
+  }
+
+  // Update SSL config to use new CA chain
+  mbedtls_ssl_conf_ca_chain(&mbed_ctx->conf, &mbed_ctx->ca_chain, nullptr);
+  return true;
+}
+
+bool set_verify_callback(ctx_t ctx, VerifyCallback callback) {
+  if (!ctx) { return false; }
+  auto *mbed_ctx = static_cast<impl::MbedTlsContext *>(ctx);
+
+  impl::get_verify_callback() = std::move(callback);
+  mbed_ctx->has_verify_callback =
+      static_cast<bool>(impl::get_verify_callback());
+
+  if (mbed_ctx->has_verify_callback) {
+    // Set OPTIONAL mode to ensure callback is called even when verification
+    // is disabled (matching OpenSSL behavior where SSL_VERIFY_PEER is set)
+    mbedtls_ssl_conf_authmode(&mbed_ctx->conf, MBEDTLS_SSL_VERIFY_OPTIONAL);
+    mbedtls_ssl_conf_verify(&mbed_ctx->conf, impl::mbedtls_verify_callback,
+                            nullptr);
+  } else {
+    mbedtls_ssl_conf_verify(&mbed_ctx->conf, nullptr, nullptr);
+  }
+  return true;
+}
+
+long get_verify_error(const_session_t session) {
+  if (!session) { return -1; }
+  auto *msession =
+      static_cast<impl::MbedTlsSession *>(const_cast<void *>(session));
+  return static_cast<long>(mbedtls_ssl_get_verify_result(&msession->ssl));
+}
+
+std::string verify_error_string(long error_code) {
+  if (error_code == 0) { return ""; }
+  char buf[256];
+  mbedtls_x509_crt_verify_info(buf, sizeof(buf), "",
+                               static_cast<uint32_t>(error_code));
+  // Remove trailing newline if present
+  std::string result(buf);
+  while (!result.empty() && (result.back() == '\n' || result.back() == ' ')) {
+    result.pop_back();
+  }
+  return result;
+}
+
+} // namespace tls
+
+#endif // CPPHTTPLIB_MBEDTLS_SUPPORT
+
 } // namespace httplib
diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h
index 7c7790f41f..1fd8b1d1e8 100644
--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
 
-#define CPPHTTPLIB_VERSION "0.30.2"
-#define CPPHTTPLIB_VERSION_NUM "0x001E02"
+#define CPPHTTPLIB_VERSION "0.31.0"
+#define CPPHTTPLIB_VERSION_NUM "0x001F00"
 
 /*
  * Platform compatibility check
@@ -147,7 +147,7 @@
 #endif
 
 #ifndef CPPHTTPLIB_PAYLOAD_MAX_LENGTH
-#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH ((std::numeric_limits<size_t>::max)())
+#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH (100 * 1024 * 1024) // 100MB
 #endif
 
 #ifndef CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH
@@ -383,6 +383,45 @@ using socket_t = int;
 
 #endif // CPPHTTPLIB_OPENSSL_SUPPORT
 
+#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT
+#include <mbedtls/ctr_drbg.h>
+#include <mbedtls/entropy.h>
+#include <mbedtls/error.h>
+#include <mbedtls/md5.h>
+#include <mbedtls/net_sockets.h>
+#include <mbedtls/oid.h>
+#include <mbedtls/pk.h>
+#include <mbedtls/sha1.h>
+#include <mbedtls/sha256.h>
+#include <mbedtls/sha512.h>
+#include <mbedtls/ssl.h>
+#include <mbedtls/x509_crt.h>
+#ifdef _WIN32
+#include <wincrypt.h>
+#ifdef _MSC_VER
+#pragma comment(lib, "crypt32.lib")
+#endif
+#endif // _WIN32
+#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
+#if TARGET_OS_MAC
+#include <Security/Security.h>
+#endif
+#endif // CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
+
+// Mbed TLS 3.x API compatibility
+#if MBEDTLS_VERSION_MAJOR >= 3
+#define CPPHTTPLIB_MBEDTLS_V3
+#endif
+
+#endif // CPPHTTPLIB_MBEDTLS_SUPPORT
+
+// Define CPPHTTPLIB_SSL_ENABLED if any SSL backend is available
+// This simplifies conditional compilation when adding new backends (e.g.,
+// wolfSSL)
+#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) || defined(CPPHTTPLIB_MBEDTLS_SUPPORT)
+#define CPPHTTPLIB_SSL_ENABLED
+#endif
+
 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
 #include <zlib.h>
 #endif
@@ -799,6 +838,105 @@ public:
 using Range = std::pair<ssize_t, ssize_t>;
 using Ranges = std::vector<Range>;
 
+#ifdef CPPHTTPLIB_SSL_ENABLED
+// TLS abstraction layer - public type definitions and API
+namespace tls {
+
+// Opaque handles (defined as void* for abstraction)
+using ctx_t = void *;
+using session_t = void *;
+using const_session_t = const void *; // For read-only session access
+using cert_t = void *;
+using ca_store_t = void *;
+
+// TLS versions
+enum class Version {
+  TLS1_2 = 0x0303,
+  TLS1_3 = 0x0304,
+};
+
+// Subject Alternative Names (SAN) entry types
+enum class SanType { DNS, IP, EMAIL, URI, OTHER };
+
+// SAN entry structure
+struct SanEntry {
+  SanType type;
+  std::string value;
+};
+
+// Verification context for certificate verification callback
+struct VerifyContext {
+  session_t session;        // TLS session handle
+  cert_t cert;              // Current certificate being verified
+  int depth;                // Certificate chain depth (0 = leaf)
+  bool preverify_ok;        // OpenSSL/Mbed TLS pre-verification result
+  long error_code;          // Backend-specific error code (0 = no error)
+  const char *error_string; // Human-readable error description
+
+  // Certificate introspection methods
+  std::string subject_cn() const;
+  std::string issuer_name() const;
+  bool check_hostname(const char *hostname) const;
+  std::vector<SanEntry> sans() const;
+  bool validity(time_t &not_before, time_t &not_after) const;
+  std::string serial() const;
+};
+
+using VerifyCallback = std::function<bool(const VerifyContext &ctx)>;
+
+// TlsError codes for TLS operations (backend-independent)
+enum class ErrorCode : int {
+  Success = 0,
+  WantRead,         // Non-blocking: need to wait for read
+  WantWrite,        // Non-blocking: need to wait for write
+  PeerClosed,       // Peer closed the connection
+  Fatal,            // Unrecoverable error
+  SyscallError,     // System call error (check sys_errno)
+  CertVerifyFailed, // Certificate verification failed
+  HostnameMismatch, // Hostname verification failed
+};
+
+// TLS error information
+struct TlsError {
+  ErrorCode code = ErrorCode::Fatal;
+  uint64_t backend_code = 0; // OpenSSL: ERR_get_error(), mbedTLS: return value
+  int sys_errno = 0;         // errno when SyscallError
+
+  // Convert verification error code to human-readable string
+  static std::string verify_error_to_string(long error_code);
+};
+
+// RAII wrapper for peer certificate
+class PeerCert {
+public:
+  PeerCert();
+  PeerCert(PeerCert &&other) noexcept;
+  PeerCert &operator=(PeerCert &&other) noexcept;
+  ~PeerCert();
+
+  PeerCert(const PeerCert &) = delete;
+  PeerCert &operator=(const PeerCert &) = delete;
+
+  explicit operator bool() const;
+  std::string subject_cn() const;
+  std::string issuer_name() const;
+  bool check_hostname(const char *hostname) const;
+  std::vector<SanEntry> sans() const;
+  bool validity(time_t &not_before, time_t &not_after) const;
+  std::string serial() const;
+
+private:
+  explicit PeerCert(cert_t cert);
+  cert_t cert_ = nullptr;
+  friend PeerCert get_peer_cert_from_session(const_session_t session);
+};
+
+// Callback for TLS context setup (used by SSLServer constructor)
+using ContextSetupCallback = std::function<bool(ctx_t ctx)>;
+
+} // namespace tls
+#endif
+
 struct Request {
   std::string method;
   std::string path;
@@ -828,9 +966,6 @@ struct Request {
   ContentReceiverWithProgress content_receiver;
   DownloadProgress download_progress;
   UploadProgress upload_progress;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  const SSL *ssl = nullptr;
-#endif
 
   bool has_header(const std::string &key) const;
   std::string get_header_value(const std::string &key, const char *def = "",
@@ -858,6 +993,12 @@ struct Request {
   size_t authorization_count_ = 0;
   std::chrono::time_point<std::chrono::steady_clock> start_time_ =
       (std::chrono::steady_clock::time_point::min)();
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  tls::const_session_t ssl = nullptr;
+  tls::PeerCert peer_cert() const;
+  std::string sni() const;
+#endif
 };
 
 struct Response {
@@ -1005,74 +1146,18 @@ public:
 
 class ThreadPool final : public TaskQueue {
 public:
-  explicit ThreadPool(size_t n, size_t mqr = 0)
-      : shutdown_(false), max_queued_requests_(mqr) {
-    threads_.reserve(n);
-    while (n) {
-      threads_.emplace_back(worker(*this));
-      n--;
-    }
-  }
-
+  explicit ThreadPool(size_t n, size_t mqr = 0);
   ThreadPool(const ThreadPool &) = delete;
   ~ThreadPool() override = default;
 
-  bool enqueue(std::function<void()> fn) override {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) {
-        return false;
-      }
-      jobs_.push_back(std::move(fn));
-    }
-
-    cond_.notify_one();
-    return true;
-  }
-
-  void shutdown() override {
-    // Stop all worker threads...
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      shutdown_ = true;
-    }
-
-    cond_.notify_all();
-
-    // Join...
-    for (auto &t : threads_) {
-      t.join();
-    }
-  }
+  bool enqueue(std::function<void()> fn) override;
+  void shutdown() override;
 
 private:
   struct worker {
-    explicit worker(ThreadPool &pool) : pool_(pool) {}
+    explicit worker(ThreadPool &pool);
 
-    void operator()() {
-      for (;;) {
-        std::function<void()> fn;
-        {
-          std::unique_lock<std::mutex> lock(pool_.mutex_);
-
-          pool_.cond_.wait(
-              lock, [&] { return !pool_.jobs_.empty() || pool_.shutdown_; });
-
-          if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
-
-          fn = pool_.jobs_.front();
-          pool_.jobs_.pop_front();
-        }
-
-        assert(true == static_cast<bool>(fn));
-        fn();
-      }
-
-#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) &&   \
-    !defined(LIBRESSL_VERSION_NUMBER)
-      OPENSSL_thread_stop();
-#endif
-    }
+    void operator()();
 
     ThreadPool &pool_;
   };
@@ -1184,6 +1269,9 @@ int close_socket(socket_t sock);
 
 ssize_t write_headers(Stream &strm, const Headers &headers);
 
+bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec,
+                         time_t usec);
+
 } // namespace detail
 
 class Server {
@@ -1429,17 +1517,6 @@ public:
          Headers &&request_headers = Headers{})
       : res_(std::move(res)), err_(err),
         request_headers_(std::move(request_headers)) {}
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  Result(std::unique_ptr<Response> &&res, Error err, Headers &&request_headers,
-         int ssl_error)
-      : res_(std::move(res)), err_(err),
-        request_headers_(std::move(request_headers)), ssl_error_(ssl_error) {}
-  Result(std::unique_ptr<Response> &&res, Error err, Headers &&request_headers,
-         int ssl_error, unsigned long ssl_openssl_error)
-      : res_(std::move(res)), err_(err),
-        request_headers_(std::move(request_headers)), ssl_error_(ssl_error),
-        ssl_openssl_error_(ssl_openssl_error) {}
-#endif
   // Response
   operator bool() const { return res_ != nullptr; }
   bool operator==(std::nullptr_t) const { return res_ == nullptr; }
@@ -1454,13 +1531,6 @@ public:
   // Error
   Error error() const { return err_; }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  // SSL Error
-  int ssl_error() const { return ssl_error_; }
-  // OpenSSL Error
-  unsigned long ssl_openssl_error() const { return ssl_openssl_error_; }
-#endif
-
   // Request Headers
   bool has_request_header(const std::string &key) const;
   std::string get_request_header_value(const std::string &key,
@@ -1474,64 +1544,76 @@ private:
   std::unique_ptr<Response> res_;
   Error err_ = Error::Unknown;
   Headers request_headers_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+public:
+  Result(std::unique_ptr<Response> &&res, Error err, Headers &&request_headers,
+         int ssl_error)
+      : res_(std::move(res)), err_(err),
+        request_headers_(std::move(request_headers)), ssl_error_(ssl_error) {}
+  Result(std::unique_ptr<Response> &&res, Error err, Headers &&request_headers,
+         int ssl_error, unsigned long ssl_backend_error)
+      : res_(std::move(res)), err_(err),
+        request_headers_(std::move(request_headers)), ssl_error_(ssl_error),
+        ssl_backend_error_(ssl_backend_error) {}
+
+  int ssl_error() const { return ssl_error_; }
+  unsigned long ssl_backend_error() const { return ssl_backend_error_; }
+
+private:
   int ssl_error_ = 0;
-  unsigned long ssl_openssl_error_ = 0;
+  unsigned long ssl_backend_error_ = 0;
+#endif
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+public:
+  [[deprecated("Use ssl_backend_error() instead")]]
+  unsigned long ssl_openssl_error() const {
+    return ssl_backend_error_;
+  }
 #endif
 };
 
 struct ClientConnection {
   socket_t sock = INVALID_SOCKET;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  SSL *ssl = nullptr;
-#endif
 
   bool is_open() const { return sock != INVALID_SOCKET; }
 
   ClientConnection() = default;
 
-  ~ClientConnection() {
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    if (ssl) {
-      SSL_free(ssl);
-      ssl = nullptr;
-    }
-#endif
-    if (sock != INVALID_SOCKET) {
-      detail::close_socket(sock);
-      sock = INVALID_SOCKET;
-    }
-  }
+  ~ClientConnection();
 
   ClientConnection(const ClientConnection &) = delete;
   ClientConnection &operator=(const ClientConnection &) = delete;
 
   ClientConnection(ClientConnection &&other) noexcept
       : sock(other.sock)
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
         ,
-        ssl(other.ssl)
+        session(other.session)
 #endif
   {
     other.sock = INVALID_SOCKET;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    other.ssl = nullptr;
+#ifdef CPPHTTPLIB_SSL_ENABLED
+    other.session = nullptr;
 #endif
   }
 
   ClientConnection &operator=(ClientConnection &&other) noexcept {
     if (this != &other) {
       sock = other.sock;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      ssl = other.ssl;
-#endif
       other.sock = INVALID_SOCKET;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      other.ssl = nullptr;
+#ifdef CPPHTTPLIB_SSL_ENABLED
+      session = other.session;
+      other.session = nullptr;
 #endif
     }
     return *this;
   }
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+  tls::session_t session = nullptr;
+#endif
 };
 
 namespace detail {
@@ -1540,7 +1622,9 @@ struct ChunkedDecoder;
 
 struct BodyReader {
   Stream *stream = nullptr;
+  bool has_content_length = false;
   size_t content_length = 0;
+  size_t payload_max_length = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
   size_t bytes_read = 0;
   bool chunked = false;
   bool eof = false;
@@ -1610,6 +1694,7 @@ public:
     std::unique_ptr<detail::decompressor> decompressor_;
     std::string decompress_buffer_;
     size_t decompress_offset_ = 0;
+    size_t decompressed_bytes_read_ = 0;
   };
 
   // clang-format off
@@ -1756,10 +1841,6 @@ public:
 
   void set_basic_auth(const std::string &username, const std::string &password);
   void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username,
-                       const std::string &password);
-#endif
 
   void set_keep_alive(bool on);
   void set_follow_location(bool on);
@@ -1770,30 +1851,14 @@ public:
 
   void set_decompress(bool on);
 
+  void set_payload_max_length(size_t length);
+
   void set_interface(const std::string &intf);
 
   void set_proxy(const std::string &host, int port);
   void set_proxy_basic_auth(const std::string &username,
                             const std::string &password);
   void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username,
-                             const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path,
-                        const std::string &ca_cert_dir_path = std::string());
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-  void enable_server_hostname_verification(bool enabled);
-  void set_server_certificate_verifier(
-      std::function<SSLVerifierResponse(SSL *ssl)> verifier);
-#endif
 
   void set_logger(Logger logger);
   void set_error_logger(ErrorLogger error_logger);
@@ -1801,11 +1866,15 @@ public:
 protected:
   struct Socket {
     socket_t sock = INVALID_SOCKET;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    SSL *ssl = nullptr;
-#endif
+
+    // For Mbed TLS compatibility: start_time for request timeout tracking
+    std::chrono::time_point<std::chrono::steady_clock> start_time_;
 
     bool is_open() const { return sock != INVALID_SOCKET; }
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+    tls::session_t ssl = nullptr;
+#endif
   };
 
   virtual bool create_and_connect_socket(Socket &socket, Error &error);
@@ -1872,10 +1941,6 @@ protected:
   std::string basic_auth_username_;
   std::string basic_auth_password_;
   std::string bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string digest_auth_username_;
-  std::string digest_auth_password_;
-#endif
 
   bool keep_alive_ = false;
   bool follow_location_ = false;
@@ -1890,6 +1955,9 @@ protected:
   bool compress_ = false;
   bool decompress_ = true;
 
+  size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
+  bool has_payload_max_length_ = false;
+
   std::string interface_;
 
   std::string proxy_host_;
@@ -1898,33 +1966,11 @@ protected:
   std::string proxy_basic_auth_username_;
   std::string proxy_basic_auth_password_;
   std::string proxy_bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string proxy_digest_auth_username_;
-  std::string proxy_digest_auth_password_;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string ca_cert_file_path_;
-  std::string ca_cert_dir_path_;
-
-  X509_STORE *ca_cert_store_ = nullptr;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  bool server_certificate_verification_ = true;
-  bool server_hostname_verification_ = true;
-  std::function<SSLVerifierResponse(SSL *ssl)> server_certificate_verifier_;
-#endif
 
   mutable std::mutex logger_mutex_;
   Logger logger_;
   ErrorLogger error_logger_;
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  int last_ssl_error_ = 0;
-  unsigned long last_openssl_error_ = 0;
-#endif
-
 private:
   bool send_(Request &req, Response &res, Error &error);
   Result send_(Request &&req);
@@ -1969,6 +2015,44 @@ private:
   virtual bool is_ssl() const;
 
   void transfer_socket_ownership_to_handle(StreamHandle &handle);
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+public:
+  void set_digest_auth(const std::string &username,
+                       const std::string &password);
+  void set_proxy_digest_auth(const std::string &username,
+                             const std::string &password);
+  void set_ca_cert_path(const std::string &ca_cert_file_path,
+                        const std::string &ca_cert_dir_path = std::string());
+  void enable_server_certificate_verification(bool enabled);
+  void enable_server_hostname_verification(bool enabled);
+
+protected:
+  std::string digest_auth_username_;
+  std::string digest_auth_password_;
+  std::string proxy_digest_auth_username_;
+  std::string proxy_digest_auth_password_;
+  std::string ca_cert_file_path_;
+  std::string ca_cert_dir_path_;
+  bool server_certificate_verification_ = true;
+  bool server_hostname_verification_ = true;
+  std::string ca_cert_pem_; // Store CA cert PEM for redirect transfer
+  int last_ssl_error_ = 0;
+  unsigned long last_backend_error_ = 0;
+#endif
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+public:
+  [[deprecated("Use load_ca_cert_store() instead")]]
+  void set_ca_cert_store(X509_STORE *ca_cert_store);
+
+  [[deprecated("Use tls::create_ca_store() instead")]]
+  X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const;
+
+  [[deprecated("Use set_server_certificate_verifier(VerifyCallback) instead")]]
+  virtual void set_server_certificate_verifier(
+      std::function<SSLVerifierResponse(SSL *ssl)> verifier);
+#endif
 };
 
 class Client {
@@ -2138,10 +2222,6 @@ public:
 
   void set_basic_auth(const std::string &username, const std::string &password);
   void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username,
-                       const std::string &password);
-#endif
 
   void set_keep_alive(bool on);
   void set_follow_location(bool on);
@@ -2153,49 +2233,65 @@ public:
 
   void set_decompress(bool on);
 
+  void set_payload_max_length(size_t length);
+
   void set_interface(const std::string &intf);
 
   void set_proxy(const std::string &host, int port);
   void set_proxy_basic_auth(const std::string &username,
                             const std::string &password);
   void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username,
-                             const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-  void enable_server_hostname_verification(bool enabled);
-  void set_server_certificate_verifier(
-      std::function<SSLVerifierResponse(SSL *ssl)> verifier);
-#endif
-
   void set_logger(Logger logger);
   void set_error_logger(ErrorLogger error_logger);
 
-  // SSL
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path,
-                        const std::string &ca_cert_dir_path = std::string());
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  void load_ca_cert_store(const char *ca_cert, std::size_t size);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-#endif
-
 private:
   std::unique_ptr<ClientImpl> cli_;
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
+public:
+  void set_digest_auth(const std::string &username,
+                       const std::string &password);
+  void set_proxy_digest_auth(const std::string &username,
+                             const std::string &password);
+  void enable_server_certificate_verification(bool enabled);
+  void enable_server_hostname_verification(bool enabled);
+  void set_ca_cert_path(const std::string &ca_cert_file_path,
+                        const std::string &ca_cert_dir_path = std::string());
+
+  void set_ca_cert_store(tls::ca_store_t ca_cert_store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);
+
+  void set_server_certificate_verifier(tls::VerifyCallback verifier);
+
+  void set_session_verifier(
+      std::function<SSLVerifierResponse(tls::session_t)> verifier);
+
+  tls::ctx_t tls_context() const;
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+  void enable_windows_certificate_verification(bool enabled);
+#endif
+
+private:
   bool is_ssl_ = false;
 #endif
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+public:
+  [[deprecated("Use tls_context() instead")]]
+  SSL_CTX *ssl_context() const;
+
+  [[deprecated("Use set_session_verifier(session_t) instead")]]
+  void set_server_certificate_verifier(
+      std::function<SSLVerifierResponse(SSL *ssl)> verifier);
+
+  [[deprecated("Use Result::ssl_backend_error() instead")]]
+  long get_verify_result() const;
+#endif
 };
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+#ifdef CPPHTTPLIB_SSL_ENABLED
 class SSLServer : public Server {
 public:
   SSLServer(const char *cert_path, const char *private_key_path,
@@ -2203,32 +2299,60 @@ public:
             const char *client_ca_cert_dir_path = nullptr,
             const char *private_key_password = nullptr);
 
-  SSLServer(X509 *cert, EVP_PKEY *private_key,
-            X509_STORE *client_ca_cert_store = nullptr);
+  struct PemMemory {
+    const char *cert_pem;
+    size_t cert_pem_len;
+    const char *key_pem;
+    size_t key_pem_len;
+    const char *client_ca_pem;
+    size_t client_ca_pem_len;
+    const char *private_key_password;
+  };
+  explicit SSLServer(const PemMemory &pem);
 
-  SSLServer(
-      const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback);
+  // The callback receives the ctx_t handle which can be cast to the
+  // appropriate backend type (SSL_CTX* for OpenSSL,
+  // tls::impl::MbedTlsContext* for Mbed TLS)
+  explicit SSLServer(const tls::ContextSetupCallback &setup_callback);
 
   ~SSLServer() override;
 
   bool is_valid() const override;
 
-  SSL_CTX *ssl_context() const;
+  bool update_certs_pem(const char *cert_pem, const char *key_pem,
+                        const char *client_ca_pem = nullptr,
+                        const char *password = nullptr);
 
-  void update_certs(X509 *cert, EVP_PKEY *private_key,
-                    X509_STORE *client_ca_cert_store = nullptr);
+  tls::ctx_t tls_context() const { return ctx_; }
 
   int ssl_last_error() const { return last_ssl_error_; }
 
 private:
   bool process_and_close_socket(socket_t sock) override;
 
-  STACK_OF(X509_NAME) * extract_ca_names_from_x509_store(X509_STORE *store);
-
-  SSL_CTX *ctx_;
+  tls::ctx_t ctx_ = nullptr;
   std::mutex ctx_mutex_;
 
   int last_ssl_error_ = 0;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+public:
+  [[deprecated("Use SSLServer(PemMemory) or "
+               "SSLServer(ContextSetupCallback) instead")]]
+  SSLServer(X509 *cert, EVP_PKEY *private_key,
+            X509_STORE *client_ca_cert_store = nullptr);
+
+  [[deprecated("Use SSLServer(ContextSetupCallback) instead")]]
+  SSLServer(
+      const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback);
+
+  [[deprecated("Use tls_context() instead")]]
+  SSL_CTX *ssl_context() const;
+
+  [[deprecated("Use update_certs_pem() instead")]]
+  void update_certs(X509 *cert, EVP_PKEY *private_key,
+                    X509_STORE *client_ca_cert_store = nullptr);
+#endif
 };
 
 class SSLClient final : public ClientImpl {
@@ -2242,20 +2366,34 @@ public:
                      const std::string &client_key_path,
                      const std::string &private_key_password = std::string());
 
-  explicit SSLClient(const std::string &host, int port, X509 *client_cert,
-                     EVP_PKEY *client_key,
-                     const std::string &private_key_password = std::string());
+  struct PemMemory {
+    const char *cert_pem;
+    size_t cert_pem_len;
+    const char *key_pem;
+    size_t key_pem_len;
+    const char *private_key_password;
+  };
+  explicit SSLClient(const std::string &host, int port, const PemMemory &pem);
 
   ~SSLClient() override;
 
   bool is_valid() const override;
 
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
+  void set_ca_cert_store(tls::ca_store_t ca_cert_store);
   void load_ca_cert_store(const char *ca_cert, std::size_t size);
 
-  long get_openssl_verify_result() const;
+  void set_server_certificate_verifier(tls::VerifyCallback verifier);
 
-  SSL_CTX *ssl_context() const;
+  // Post-handshake session verifier (backend-independent)
+  void set_session_verifier(
+      std::function<SSLVerifierResponse(tls::session_t)> verifier);
+
+  tls::ctx_t tls_context() const { return ctx_; }
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+  void enable_windows_certificate_verification(bool enabled);
+#endif
 
 private:
   bool create_and_connect_socket(Socket &socket, Error &error) override;
@@ -2277,26 +2415,45 @@ private:
 
   bool load_certs();
 
-  bool verify_host(X509 *server_cert) const;
-  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
-  bool verify_host_with_common_name(X509 *server_cert) const;
-  bool check_host_name(const char *pattern, size_t pattern_len) const;
-
-  SSL_CTX *ctx_;
+  tls::ctx_t ctx_ = nullptr;
   std::mutex ctx_mutex_;
   std::once_flag initialize_cert_;
 
-  std::vector<std::string> host_components_;
-
   long verify_result_ = 0;
 
-  friend class ClientImpl;
-};
+  std::function<SSLVerifierResponse(tls::session_t)> session_verifier_;
+
+#if defined(_WIN32) &&                                                         \
+    !defined(CPPHTTPLIB_DISABLE_WINDOWS_AUTOMATIC_ROOT_CERTIFICATES_UPDATE)
+  bool enable_windows_cert_verification_ = true;
 #endif
 
-/*
- * Implementation of template methods.
- */
+  friend class ClientImpl;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+public:
+  [[deprecated("Use SSLClient(host, port, PemMemory) instead")]]
+  explicit SSLClient(const std::string &host, int port, X509 *client_cert,
+                     EVP_PKEY *client_key,
+                     const std::string &private_key_password = std::string());
+
+  [[deprecated("Use Result::ssl_backend_error() instead")]]
+  long get_verify_result() const;
+
+  [[deprecated("Use tls_context() instead")]]
+  SSL_CTX *ssl_context() const;
+
+  [[deprecated("Use set_session_verifier(session_t) instead")]]
+  void set_server_certificate_verifier(
+      std::function<SSLVerifierResponse(SSL *ssl)> verifier) override;
+
+private:
+  bool verify_host(X509 *server_cert) const;
+  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
+  bool verify_host_with_common_name(X509 *server_cert) const;
+#endif
+};
+#endif // CPPHTTPLIB_SSL_ENABLED
 
 namespace detail {
 
@@ -2345,66 +2502,6 @@ inline size_t get_header_value_u64(const Headers &headers,
 
 } // namespace detail
 
-inline size_t Request::get_header_value_u64(const std::string &key, size_t def,
-                                            size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-inline size_t Response::get_header_value_u64(const std::string &key, size_t def,
-                                             size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-namespace detail {
-
-inline bool set_socket_opt_impl(socket_t sock, int level, int optname,
-                                const void *optval, socklen_t optlen) {
-  return setsockopt(sock, level, optname,
-#ifdef _WIN32
-                    reinterpret_cast<const char *>(optval),
-#else
-                    optval,
-#endif
-                    optlen) == 0;
-}
-
-inline bool set_socket_opt(socket_t sock, int level, int optname, int optval) {
-  return set_socket_opt_impl(sock, level, optname, &optval, sizeof(optval));
-}
-
-inline bool set_socket_opt_time(socket_t sock, int level, int optname,
-                                time_t sec, time_t usec) {
-#ifdef _WIN32
-  auto timeout = static_cast<uint32_t>(sec * 1000 + usec / 1000);
-#else
-  timeval timeout;
-  timeout.tv_sec = static_cast<long>(sec);
-  timeout.tv_usec = static_cast<decltype(timeout.tv_usec)>(usec);
-#endif
-  return set_socket_opt_impl(sock, level, optname, &timeout, sizeof(timeout));
-}
-
-} // namespace detail
-
-inline void default_socket_options(socket_t sock) {
-  detail::set_socket_opt(sock, SOL_SOCKET,
-#ifdef SO_REUSEPORT
-                         SO_REUSEPORT,
-#else
-                         SO_REUSEADDR,
-#endif
-                         1);
-}
-
-inline std::string get_bearer_token_auth(const Request &req) {
-  if (req.has_header("Authorization")) {
-    constexpr auto bearer_header_prefix_len = detail::str_len("Bearer ");
-    return req.get_header_value("Authorization")
-        .substr(bearer_header_prefix_len);
-  }
-  return "";
-}
-
 template <class Rep, class Period>
 inline Server &
 Server::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
@@ -2429,12 +2526,6 @@ Server::set_idle_interval(const std::chrono::duration<Rep, Period> &duration) {
   return *this;
 }
 
-inline size_t Result::get_request_header_value_u64(const std::string &key,
-                                                   size_t def,
-                                                   size_t id) const {
-  return detail::get_header_value_u64(request_headers_, key, def, id);
-}
-
 template <class Rep, class Period>
 inline void ClientImpl::set_connection_timeout(
     const std::chrono::duration<Rep, Period> &duration) {
@@ -2842,105 +2933,73 @@ bool is_field_content(const std::string &s);
 bool is_field_value(const std::string &s);
 
 } // namespace fields
-
 } // namespace detail
 
+/*
+ * TLS Abstraction Layer Declarations
+ */
+
+#ifdef CPPHTTPLIB_SSL_ENABLED
+// TLS abstraction layer - backend-specific type declarations
+#ifdef CPPHTTPLIB_MBEDTLS_SUPPORT
+namespace tls {
+namespace impl {
+
+// Mbed TLS context wrapper (holds config, entropy, DRBG, CA chain, own
+// cert/key). This struct is accessible via tls::impl for use in SSL context
+// setup callbacks (cast ctx_t to tls::impl::MbedTlsContext*).
+struct MbedTlsContext {
+  mbedtls_ssl_config conf;
+  mbedtls_entropy_context entropy;
+  mbedtls_ctr_drbg_context ctr_drbg;
+  mbedtls_x509_crt ca_chain;
+  mbedtls_x509_crt own_cert;
+  mbedtls_pk_context own_key;
+  bool is_server = false;
+  bool verify_client = false;
+  bool has_verify_callback = false;
+
+  MbedTlsContext();
+  ~MbedTlsContext();
+
+  MbedTlsContext(const MbedTlsContext &) = delete;
+  MbedTlsContext &operator=(const MbedTlsContext &) = delete;
+};
+
+} // namespace impl
+} // namespace tls
+#endif
+
+#endif // CPPHTTPLIB_SSL_ENABLED
+
 namespace stream {
 
 class Result {
 public:
-  Result() : chunk_size_(8192) {}
-
-  explicit Result(ClientImpl::StreamHandle &&handle, size_t chunk_size = 8192)
-      : handle_(std::move(handle)), chunk_size_(chunk_size) {}
-
-  Result(Result &&other) noexcept
-      : handle_(std::move(other.handle_)), buffer_(std::move(other.buffer_)),
-        current_size_(other.current_size_), chunk_size_(other.chunk_size_),
-        finished_(other.finished_) {
-    other.current_size_ = 0;
-    other.finished_ = true;
-  }
-
-  Result &operator=(Result &&other) noexcept {
-    if (this != &other) {
-      handle_ = std::move(other.handle_);
-      buffer_ = std::move(other.buffer_);
-      current_size_ = other.current_size_;
-      chunk_size_ = other.chunk_size_;
-      finished_ = other.finished_;
-      other.current_size_ = 0;
-      other.finished_ = true;
-    }
-    return *this;
-  }
-
+  Result();
+  explicit Result(ClientImpl::StreamHandle &&handle, size_t chunk_size = 8192);
+  Result(Result &&other) noexcept;
+  Result &operator=(Result &&other) noexcept;
   Result(const Result &) = delete;
   Result &operator=(const Result &) = delete;
 
-  // Check if the result is valid (connection succeeded and response received)
-  bool is_valid() const { return handle_.is_valid(); }
-  explicit operator bool() const { return is_valid(); }
-
-  // Response status code
-  int status() const {
-    return handle_.response ? handle_.response->status : -1;
-  }
-
-  // Response headers
-  const Headers &headers() const {
-    static const Headers empty_headers;
-    return handle_.response ? handle_.response->headers : empty_headers;
-  }
-
+  // Response info
+  bool is_valid() const;
+  explicit operator bool() const;
+  int status() const;
+  const Headers &headers() const;
   std::string get_header_value(const std::string &key,
-                               const char *def = "") const {
-    return handle_.response ? handle_.response->get_header_value(key, def)
-                            : def;
-  }
+                               const char *def = "") const;
+  bool has_header(const std::string &key) const;
+  Error error() const;
+  Error read_error() const;
+  bool has_read_error() const;
 
-  bool has_header(const std::string &key) const {
-    return handle_.response ? handle_.response->has_header(key) : false;
-  }
-
-  // Error information
-  Error error() const { return handle_.error; }
-  Error read_error() const { return handle_.get_read_error(); }
-  bool has_read_error() const { return handle_.has_read_error(); }
-
-  // Streaming iteration API
-  // Call next() to read the next chunk, then access data via data()/size()
-  // Returns true if data was read, false when stream is exhausted
-  bool next() {
-    if (!handle_.is_valid() || finished_) { return false; }
-
-    if (buffer_.size() < chunk_size_) { buffer_.resize(chunk_size_); }
-
-    ssize_t n = handle_.read(&buffer_[0], chunk_size_);
-    if (n > 0) {
-      current_size_ = static_cast<size_t>(n);
-      return true;
-    }
-
-    current_size_ = 0;
-    finished_ = true;
-    return false;
-  }
-
-  // Pointer to current chunk data (valid after next() returns true)
-  const char *data() const { return buffer_.data(); }
-
-  // Size of current chunk (valid after next() returns true)
-  size_t size() const { return current_size_; }
-
-  // Convenience method: read all remaining data into a string
-  std::string read_all() {
-    std::string result;
-    while (next()) {
-      result.append(data(), size());
-    }
-    return result;
-  }
+  // Stream reading
+  bool next();
+  const char *data() const;
+  size_t size() const;
+  std::string read_all();
 
 private:
   ClientImpl::StreamHandle handle_;
@@ -3205,13 +3264,8 @@ struct SSEMessage {
   std::string data;  // Event payload
   std::string id;    // Event ID for Last-Event-ID header
 
-  SSEMessage() : event("message") {}
-
-  void clear() {
-    event = "message";
-    data.clear();
-    id.clear();
-  }
+  SSEMessage();
+  void clear();
 };
 
 class SSEClient {
@@ -3220,255 +3274,40 @@ public:
   using ErrorHandler = std::function<void(Error)>;
   using OpenHandler = std::function<void()>;
 
-  SSEClient(Client &client, const std::string &path)
-      : client_(client), path_(path) {}
-
-  SSEClient(Client &client, const std::string &path, const Headers &headers)
-      : client_(client), path_(path), headers_(headers) {}
-
-  ~SSEClient() { stop(); }
+  SSEClient(Client &client, const std::string &path);
+  SSEClient(Client &client, const std::string &path, const Headers &headers);
+  ~SSEClient();
 
   SSEClient(const SSEClient &) = delete;
   SSEClient &operator=(const SSEClient &) = delete;
 
   // Event handlers
-  SSEClient &on_message(MessageHandler handler) {
-    on_message_ = std::move(handler);
-    return *this;
-  }
-
-  SSEClient &on_event(const std::string &type, MessageHandler handler) {
-    event_handlers_[type] = std::move(handler);
-    return *this;
-  }
-
-  SSEClient &on_open(OpenHandler handler) {
-    on_open_ = std::move(handler);
-    return *this;
-  }
-
-  SSEClient &on_error(ErrorHandler handler) {
-    on_error_ = std::move(handler);
-    return *this;
-  }
-
-  SSEClient &set_reconnect_interval(int ms) {
-    reconnect_interval_ms_ = ms;
-    return *this;
-  }
-
-  SSEClient &set_max_reconnect_attempts(int n) {
-    max_reconnect_attempts_ = n;
-    return *this;
-  }
+  SSEClient &on_message(MessageHandler handler);
+  SSEClient &on_event(const std::string &type, MessageHandler handler);
+  SSEClient &on_open(OpenHandler handler);
+  SSEClient &on_error(ErrorHandler handler);
+  SSEClient &set_reconnect_interval(int ms);
+  SSEClient &set_max_reconnect_attempts(int n);
 
   // State accessors
-  bool is_connected() const { return connected_.load(); }
-  const std::string &last_event_id() const { return last_event_id_; }
+  bool is_connected() const;
+  const std::string &last_event_id() const;
 
   // Blocking start - runs event loop with auto-reconnect
-  void start() {
-    running_.store(true);
-    run_event_loop();
-  }
+  void start();
 
   // Non-blocking start - runs in background thread
-  void start_async() {
-    running_.store(true);
-    async_thread_ = std::thread([this]() { run_event_loop(); });
-  }
+  void start_async();
 
   // Stop the client (thread-safe)
-  void stop() {
-    running_.store(false);
-    client_.stop(); // Cancel any pending operations
-    if (async_thread_.joinable()) { async_thread_.join(); }
-  }
+  void stop();
 
 private:
-  // Parse a single SSE field line
-  // Returns true if this line ends an event (blank line)
-  bool parse_sse_line(const std::string &line, SSEMessage &msg, int &retry_ms) {
-    // Blank line signals end of event
-    if (line.empty() || line == "\r") { return true; }
-
-    // Lines starting with ':' are comments (ignored)
-    if (!line.empty() && line[0] == ':') { return false; }
-
-    // Find the colon separator
-    auto colon_pos = line.find(':');
-    if (colon_pos == std::string::npos) {
-      // Line with no colon is treated as field name with empty value
-      return false;
-    }
-
-    auto field = line.substr(0, colon_pos);
-    std::string value;
-
-    // Value starts after colon, skip optional single space
-    if (colon_pos + 1 < line.size()) {
-      auto value_start = colon_pos + 1;
-      if (line[value_start] == ' ') { value_start++; }
-      value = line.substr(value_start);
-      // Remove trailing \r if present
-      if (!value.empty() && value.back() == '\r') { value.pop_back(); }
-    }
-
-    // Handle known fields
-    if (field == "event") {
-      msg.event = value;
-    } else if (field == "data") {
-      // Multiple data lines are concatenated with newlines
-      if (!msg.data.empty()) { msg.data += "\n"; }
-      msg.data += value;
-    } else if (field == "id") {
-      // Empty id is valid (clears the last event ID)
-      msg.id = value;
-    } else if (field == "retry") {
-      // Parse retry interval in milliseconds
-      {
-        int v = 0;
-        auto res =
-            detail::from_chars(value.data(), value.data() + value.size(), v);
-        if (res.ec == std::errc{}) { retry_ms = v; }
-      }
-    }
-    // Unknown fields are ignored per SSE spec
-
-    return false;
-  }
-
-  // Main event loop with auto-reconnect
-  void run_event_loop() {
-    auto reconnect_count = 0;
-
-    while (running_.load()) {
-      // Build headers, including Last-Event-ID if we have one
-      auto request_headers = headers_;
-      if (!last_event_id_.empty()) {
-        request_headers.emplace("Last-Event-ID", last_event_id_);
-      }
-
-      // Open streaming connection
-      auto result = stream::Get(client_, path_, request_headers);
-
-      // Connection error handling
-      if (!result) {
-        connected_.store(false);
-        if (on_error_) { on_error_(result.error()); }
-
-        if (!should_reconnect(reconnect_count)) { break; }
-        wait_for_reconnect();
-        reconnect_count++;
-        continue;
-      }
-
-      if (result.status() != 200) {
-        connected_.store(false);
-        // For certain errors, don't reconnect
-        if (result.status() == 204 || // No Content - server wants us to stop
-            result.status() == 404 || // Not Found
-            result.status() == 401 || // Unauthorized
-            result.status() == 403) { // Forbidden
-          if (on_error_) { on_error_(Error::Connection); }
-          break;
-        }
-
-        if (on_error_) { on_error_(Error::Connection); }
-
-        if (!should_reconnect(reconnect_count)) { break; }
-        wait_for_reconnect();
-        reconnect_count++;
-        continue;
-      }
-
-      // Connection successful
-      connected_.store(true);
-      reconnect_count = 0;
-      if (on_open_) { on_open_(); }
-
-      // Event receiving loop
-      std::string buffer;
-      SSEMessage current_msg;
-
-      while (running_.load() && result.next()) {
-        buffer.append(result.data(), result.size());
-
-        // Process complete lines in the buffer
-        size_t line_start = 0;
-        size_t newline_pos;
-
-        while ((newline_pos = buffer.find('\n', line_start)) !=
-               std::string::npos) {
-          auto line = buffer.substr(line_start, newline_pos - line_start);
-          line_start = newline_pos + 1;
-
-          // Parse the line and check if event is complete
-          auto event_complete =
-              parse_sse_line(line, current_msg, reconnect_interval_ms_);
-
-          if (event_complete && !current_msg.data.empty()) {
-            // Update last_event_id for reconnection
-            if (!current_msg.id.empty()) { last_event_id_ = current_msg.id; }
-
-            // Dispatch event to appropriate handler
-            dispatch_event(current_msg);
-
-            current_msg.clear();
-          }
-        }
-
-        // Keep unprocessed data in buffer
-        buffer.erase(0, line_start);
-      }
-
-      // Connection ended
-      connected_.store(false);
-
-      if (!running_.load()) { break; }
-
-      // Check for read errors
-      if (result.has_read_error()) {
-        if (on_error_) { on_error_(result.read_error()); }
-      }
-
-      if (!should_reconnect(reconnect_count)) { break; }
-      wait_for_reconnect();
-      reconnect_count++;
-    }
-
-    connected_.store(false);
-  }
-
-  // Dispatch event to appropriate handler
-  void dispatch_event(const SSEMessage &msg) {
-    // Check for specific event type handler first
-    auto it = event_handlers_.find(msg.event);
-    if (it != event_handlers_.end()) {
-      it->second(msg);
-      return;
-    }
-
-    // Fall back to generic message handler
-    if (on_message_) { on_message_(msg); }
-  }
-
-  // Check if we should attempt to reconnect
-  bool should_reconnect(int count) const {
-    if (!running_.load()) { return false; }
-    if (max_reconnect_attempts_ == 0) { return true; } // unlimited
-    return count < max_reconnect_attempts_;
-  }
-
-  // Wait for reconnect interval
-  void wait_for_reconnect() {
-    // Use small increments to check running_ flag frequently
-    auto waited = 0;
-    while (running_.load() && waited < reconnect_interval_ms_) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      waited += 100;
-    }
-  }
+  bool parse_sse_line(const std::string &line, SSEMessage &msg, int &retry_ms);
+  void run_event_loop();
+  void dispatch_event(const SSEMessage &msg);
+  bool should_reconnect(int count) const;
+  void wait_for_reconnect();
 
   // Client and path
   Client &client_;