diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5215cc3572..eee42759fc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -69,13 +69,6 @@ jobs: key: macOS-latest-cmake-arm64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -83,6 +76,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=OFF \ -DGGML_METAL_SHADER_DEBUG=ON \ @@ -110,13 +105,6 @@ jobs: key: macOS-latest-cmake-x64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -126,6 +114,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON \ -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 @@ -151,13 +141,6 @@ jobs: key: macOS-latest-cmake-arm64-webgpu evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Dawn Dependency id: dawn-depends run: | @@ -217,7 +200,7 @@ jobs: sudo apt-get update sudo apt-get install -y --no-install-recommends \ python3 python3-pip python3-dev \ - libjpeg-dev build-essential libcurl4-openssl-dev \ + libjpeg-dev build-essential libssl-dev \ git-lfs - name: Python Dependencies @@ -238,6 +221,8 @@ jobs: id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -294,13 +279,15 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -311,6 +298,8 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ @@ -335,7 +324,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build @@ -343,6 +332,8 @@ jobs: mkdir build cd build cmake .. \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_LLGUIDANCE=ON cmake --build . 
--config Release -j $(nproc) @@ -373,12 +364,14 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -405,12 +398,14 @@ jobs: - name: Dependencies id: depends run: | - sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev + sudo apt-get install -y glslc libvulkan-dev libssl-dev - name: Configure id: cmake_configure run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DGGML_BACKEND_DL=ON \ -DGGML_CPU_ALL_VARIANTS=ON \ @@ -440,7 +435,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -466,6 +461,8 @@ jobs: run: | source ./vulkan_sdk/setup-env.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_VULKAN=ON cmake --build build --config Release -j $(nproc) @@ -497,7 +494,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -537,7 +534,10 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test @@ -560,7 +560,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -572,6 +572,8 @@ jobs: id: cmake_build run: | cmake -B build -S . \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON @@ -590,7 +592,7 @@ jobs: id: depends run: | apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev + apt-get install -y build-essential git cmake libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -602,6 +604,8 @@ jobs: id: cmake_build run: | cmake -B build -S . 
\ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_MUSA=ON cmake --build build --config Release -j $(nproc) @@ -626,7 +630,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -648,6 +652,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx @@ -674,7 +680,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -696,6 +702,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx \ @@ -722,12 +730,6 @@ jobs: key: macOS-latest-cmake-ios evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -759,12 +761,6 @@ jobs: key: macOS-latest-cmake-tvos evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -790,12 +786,6 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -838,12 +828,6 @@ jobs: name: llama-xcframework path: build-apple/llama.xcframework/ - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build llama.cpp with CMake id: cmake_build run: | @@ -995,21 +979,12 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - name: Add libopenblas.dll id: add_libopenblas_dll @@ -1053,7 +1028,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev + apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -1064,10 +1039,12 @@ jobs: - name: Build with CMake run: | cmake -S . 
-B build -G Ninja \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES=89-real \ -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_NATIVE=OFF \ -DGGML_CUDA=ON cmake --build build @@ -1101,25 +1078,20 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . -B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ + -DLLAMA_CURL=OFF ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ -DGGML_CPU_ALL_VARIANTS=ON ^ -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" + -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release @@ -1151,7 +1123,7 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - name: Build id: cmake_build @@ -1208,14 +1180,8 @@ jobs: key: ${{ github.job }} evict-old-files: 1d - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -1224,11 +1190,12 @@ jobs: -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" ` -DCMAKE_BUILD_TYPE=Release ` + -DLLAMA_CURL=OFF ` + -DLLAMA_BUILD_BORINGSSL=ON ` -DROCM_DIR="${env:HIP_PATH}" ` -DGGML_HIP=ON ` -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} ios-xcode-build: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index ebcd6424bc..be4e23bc81 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -56,7 +56,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -242,7 +242,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -283,6 +283,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ @@ -295,6 +297,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; @@ -306,6 +310,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) 
--target llama-server diff --git a/README.md b/README.md index 3d94b30cda..2e44ae7d0c 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) +- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0) diff --git a/common/arg.cpp b/common/arg.cpp index 430ab45dfe..eab26b67f2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -212,7 +212,6 @@ struct handle_model_result { static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default, bool offline) { handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file @@ -257,8 +256,6 @@ static handle_model_result common_params_handle_model( model.path = fs_get_cache_file(string_split(f, '/').back()); } - } else if (model.path.empty()) { - model.path = model_path_default; } } @@ -405,7 +402,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download { - auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline); + auto res = common_params_handle_model(params.model, params.hf_token, params.offline); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -415,12 +412,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // only download mmproj if the current example is using it for (auto & ex : mmproj_examples) { if (ctx_arg.ex == ex) { - common_params_handle_model(params.mmproj, params.hf_token, "", params.offline); + common_params_handle_model(params.mmproj, params.hf_token, params.offline); break; } } - common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline); - common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline); + common_params_handle_model(params.speculative.model, params.hf_token, params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); + } + + // model is required (except for server) + // TODO @ngxson : maybe show a list of available models in CLI in this case + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) { + throw std::invalid_argument("error: --model is required\n"); } if (params.escape) { @@ -2072,11 +2075,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA - ? std::string("model path from which to load base model") - : string_format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), + ? 
"model path from which to load base model" + : "model path to load", [](common_params & params, const std::string & value) { params.model.path = value; } @@ -2474,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--models-dir"}, "PATH", + "directory containing models for the router server (default: disabled)", + [](common_params & params, const std::string & value) { + params.models_dir = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); + add_opt(common_arg( + {"--max-models"}, "N", + string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models), + [](common_params & params, int value) { + params.max_models = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS")); add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", diff --git a/common/common.cpp b/common/common.cpp index f3cc55247e..be31c66de1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -911,7 +911,7 @@ std::string fs_get_cache_file(const std::string & filename) { return cache_directory + filename; } -std::vector fs_list_files(const std::string & path) { +std::vector fs_list(const std::string & path, bool include_directories) { std::vector files; if (path.empty()) return files; @@ -926,14 +926,22 @@ std::vector fs_list_files(const std::string & path) { const auto & p = entry.path(); if (std::filesystem::is_regular_file(p)) { common_file_info info; - info.path = p.string(); - info.name = p.filename().string(); + info.path = p.string(); + info.name = p.filename().string(); + info.is_dir = false; try { info.size = static_cast(std::filesystem::file_size(p)); } catch (const std::filesystem::filesystem_error &) { info.size = 0; } files.push_back(std::move(info)); + } else if (include_directories && std::filesystem::is_directory(p)) { + common_file_info info; + info.path = p.string(); + info.name = p.filename().string(); + info.size = 0; // Directories have no size + info.is_dir = true; + files.push_back(std::move(info)); } } catch (const std::filesystem::filesystem_error &) { // skip entries we cannot inspect diff --git a/common/common.h b/common/common.h index de5b404dd8..20ba209ce4 100644 --- a/common/common.h +++ b/common/common.h @@ -26,8 +26,6 @@ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) -#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" - struct common_time_meas { common_time_meas(int64_t & t_acc, bool disable = false); ~common_time_meas(); @@ -460,6 +458,10 @@ struct common_params { bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; + // router server configs + std::string models_dir = ""; // directory containing models for the router server + int max_models = 4; // maximum number of models to load simultaneously + bool log_json = false; std::string slot_save_path; @@ -623,8 +625,9 @@ struct common_file_info { std::string path; std::string name; size_t size = 0; // in bytes + bool is_dir = false; }; -std::vector fs_list_files(const std::string & path); +std::vector fs_list(const std::string & path, bool include_directories); // // Model utils diff --git a/common/download.cpp b/common/download.cpp index eeb32b6a86..1a3bc9216f 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -1047,7 +1047,7 @@ std::string 
common_docker_resolve_model(const std::string &) { std::vector common_list_cached_models() { std::vector models; const std::string cache_dir = fs_get_cache_directory(); - const std::vector files = fs_list_files(cache_dir); + const std::vector files = fs_list(cache_dir, false); for (const auto & file : files) { if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) { common_cached_model_info model_info; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 9576dcb6e8..5cbf5683e1 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2246,8 +2246,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx bool & use_cann_graph, bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH - ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); - if (use_cann_graph && cann_graph_update_required) { + if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH @@ -2271,12 +2270,14 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx } #ifdef USE_ACL_GRAPH - if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture - ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); - } - if (use_cann_graph) { - // Execute graph + ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); + + if (cann_graph_update_required) { // End CANN graph capture + ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); + } + + // Execute CANN graph ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } #endif // USE_ACL_GRAPH diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 25e9308d75..99ec96869a 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -224,6 +224,10 @@ static const char * cu_get_error_str(CUresult err) { #define AMD_MFMA_AVAILABLE #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA) +#if defined(GGML_USE_HIP) && defined(RDNA4) +#define AMD_WMMA_AVAILABLE +#endif // defined(GGML_USE_HIP) && defined(RDNA4) + // The Volta instructions are in principle available on Turing or newer but they are effectively unusable: #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA #define VOLTA_MMA_AVAILABLE @@ -283,6 +287,10 @@ static bool amd_mfma_available(const int cc) { #endif //!defined(GGML_HIP_NO_MMQ_MFMA) } +static bool amd_wmma_available(const int cc) { + return GGML_CUDA_CC_IS_RDNA4(cc); +} + static bool volta_mma_available(const int cc) { return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA; } diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index 8a5e08ef66..09f9a33f90 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -39,6 +39,15 @@ template return __float2bfloat16(float(x)); } else if constexpr(std::is_same_v) { return __bfloat162float(x); + } else if constexpr(std::is_same_v && std::is_same_v) { + return __float22half2_rn(x); + } else if constexpr(std::is_same_v && std::is_same_v) { + // bypass compile error on cuda 12.0.1 +#ifdef GGML_USE_HIP + return __float22bfloat162_rn(x); +#else + return {x.x, x.y}; +#endif // GGML_USE_HIP } else if constexpr(std::is_same_v) { return int32_t(x); } else { diff 
--git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index a7a28fd1ae..c3c4b77996 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -74,6 +74,33 @@ namespace ggml_cuda_mma { static constexpr int J = J_; #if defined(GGML_USE_HIP) +#if defined(RDNA4) + static constexpr int ne = I * J / 32; + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 16) { + return 8 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } +#else static constexpr int ne = I * J / 64; T x[ne] = {0}; @@ -119,6 +146,7 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(RDNA4) #elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA static constexpr int ne = I * J / 32; T x[ne] = {0}; @@ -236,6 +264,32 @@ namespace ggml_cuda_mma { return -1; } } +#elif defined(AMD_WMMA_AVAILABLE) + static constexpr int ne = I * J / 32; + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return 4 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } #else static constexpr int ne = I * J / WARP_SIZE; half2 x[ne] = {{0.0f, 0.0f}}; @@ -285,6 +339,34 @@ namespace ggml_cuda_mma { struct tile { static constexpr int I = I_; static constexpr int J = J_; + +#if defined(AMD_WMMA_AVAILABLE) + static constexpr int ne = I * J / 32; + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return 4 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } +#else static constexpr int ne = I * J / WARP_SIZE; nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; @@ -320,6 +402,7 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(AMD_WMMA_AVAILABLE) }; template @@ -353,6 +436,8 @@ namespace ggml_cuda_mma { const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); xi[0] = xs[0]; } +#elif defined(AMD_WMMA_AVAILABLE) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); #else #pragma unroll for (int l = 0; l < t.ne; ++l) { @@ -639,12 +724,34 @@ namespace ggml_cuda_mma { : "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7]) : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE +#elif defined(AMD_WMMA_AVAILABLE) + using halfx8_t = __attribute__((ext_vector_type(8))) _Float16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const halfx8_t& a_frag = reinterpret_cast(A.x[0]); + const halfx8_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = 
__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag); #else GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } + static __device__ __forceinline__ void mma( + tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) { +#if defined(AMD_WMMA_AVAILABLE) + using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const bf16x8_t& a_frag = reinterpret_cast(A.x[0]); + const bf16x8_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; +#endif // AMPERE_MMA_AVAILABLE + } + static __device__ __forceinline__ void mma( tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) { #if defined(AMD_MFMA_AVAILABLE) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 153dd5a97d..5c51a22256 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16) { + if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) { return false; } } @@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const case GGML_TYPE_F32: return ampere_mma_available(cc); case GGML_TYPE_F16: - return volta_mma_available(cc) || turing_mma_available(cc); + return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc); case GGML_TYPE_BF16: - return ampere_mma_available(cc); + return ampere_mma_available(cc) || amd_wmma_available(cc); default: return false; } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index 45724e0911..c2a0a2e42f 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -2,6 +2,7 @@ #include "mma.cuh" #include "common.cuh" +#include "convert.cuh" using namespace ggml_cuda_mma; @@ -27,20 +28,35 @@ static __global__ void mul_mat_f( const int stride_col_id, const int stride_row_id, const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added +#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_WMMA_AVAILABLE) + // Special case for tf32, just dummy mma layout as wmma doesn't support it. + constexpr int tile_B_I = std::is_same_v ? 8 : 16; + constexpr int tile_C_J = std::is_same_v ? 
8 : 16; + typedef tile<16, 8, T> tile_A; + typedef tile tile_B; + typedef tile<16, tile_C_J, float> tile_C; + + constexpr bool a_supported = tile_A::supported(); + constexpr bool b_supported = tile_B::supported(); + constexpr bool c_supported = tile_C::supported(); + constexpr bool supported = a_supported && b_supported && c_supported; +#else constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported(); constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported(); - - if (!I_16_supported && !I_32_supported) { - NO_DEVICE_CODE; - return; - } + constexpr bool supported = I_16_supported || I_32_supported; constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster. typedef tile tile_A; typedef tile<8, 8, T> tile_B; typedef tile tile_C; +#endif // defined(AMD_WMMA_AVAILABLE) + if constexpr (!supported) { + NO_DEVICE_CODE; + return; + } constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int tile_k_padded = warp_size + 4; @@ -161,11 +177,11 @@ static __global__ void mul_mat_f( if constexpr (!has_ids) { const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } else { const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0; float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } } } else { @@ -239,7 +255,7 @@ static __global__ void mul_mat_f( channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) } //This kernel is for larger batch sizes of mul_mat_id @@ -253,20 +269,35 @@ static __global__ void mul_mat_f_ids( const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const uint3 sis1_fd, const uint3 nch_fd) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added +#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_WMMA_AVAILABLE) + // Special case for tf32, just dummy mma layout as wmma doesn't support it. + constexpr int tile_B_I = std::is_same_v ? 8 : 16; + constexpr int tile_C_J = std::is_same_v ? 
8 : 16; + typedef tile<16, 8, T> tile_A; + typedef tile tile_B; + typedef tile<16, tile_C_J, float> tile_C; + + constexpr bool a_supported = tile_A::supported(); + constexpr bool b_supported = tile_B::supported(); + constexpr bool c_supported = tile_C::supported(); + constexpr bool supported = a_supported && b_supported && c_supported; +#else constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported(); constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported(); + constexpr bool supported = I_16_supported || I_32_supported; - if (!I_16_supported && !I_32_supported) { - NO_DEVICE_CODE; - return; - } - - constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster. + constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster. typedef tile tile_A; typedef tile<8, 8, T> tile_B; typedef tile tile_C; +#endif // defined(AMD_WMMA_AVAILABLE) + if constexpr (!supported) { + NO_DEVICE_CODE; + return; + } constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int tile_k_padded = warp_size + 4; @@ -408,7 +439,7 @@ static __global__ void mul_mat_f_ids( #pragma unroll for (int j0 = 0; j0 < tile_B::I; ++j0) { const float2 tmp = vals_buf[curr_buf][j0]; - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } if (itB + 1 < ntB) { @@ -492,7 +523,7 @@ static __global__ void mul_mat_f_ids( channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd); NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) } template @@ -554,7 +585,8 @@ void mul_mat_f_cuda( cudaStream_t stream, const mmf_ids_data * ids_data) { typedef tile<16, 8, T> tile_A_16; typedef tile<32, 8, T> tile_A_32; - typedef tile< 8, 8, T> tile_B; + typedef tile<16, 8, T> tile_B_16; + typedef tile< 8, 8, T> tile_B_8; GGML_ASSERT(ncols_x % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); @@ -581,7 +613,8 @@ void mul_mat_f_cuda( constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4; - const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4; + const int nbytes_cols_per_block_pad = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I; + const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4; const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); const int nbytes_slotmap = ids ? 
GGML_PAD(cols_per_block, 16) * sizeof(int) : 0; const int nbytes_shared_total = nbytes_shared + nbytes_slotmap; diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 16044975d9..87b09cca3a 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -106,33 +106,32 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; - int opt_path = 0; if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; - bool src1_valid = src1->ne[0]; + const bool src1_valid = src1->ne[0]; + const int nc = (src1_valid) ? ne00 : ne00 / 2; if (!src1_valid) { - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int32_t swapped = op_params[1]; + data_src1 = data_src0; + src1_row_size = src0_row_size; + + const size_t nc_in_bytes = nc * SIZEOF_FP32; + data_src0 += swapped ? nc_in_bytes : 0; + data_src1 += swapped ? 0 : nc_in_bytes; } uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int32_t swapped = op_params[1]; - - const int nc = (src1_valid) ? ne0 : ne0 / 2; - + const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); @@ -142,12 +141,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - if (!src1_valid) { - src0 += swapped ? nc : 0; - src1 += swapped ? 0 : nc; - } - - if (1 == opt_path) { + if (opt_path) { hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc); @@ -218,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, const float alpha = ((const float *) (op_params))[2]; const float limit = ((const float *) (op_params))[3]; - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const int nc = (src1_valid) ? 
ne00 : ne00 / 2; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 19f6795083..d0735e9325 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,6 +16,19 @@ #include "hvx-utils.h" #include "ops-utils.h" +static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_fp32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { int left_over = num_elems & (VLEN_FP32 - 1); int num_elems_whole = num_elems - left_over; @@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); } } } else { @@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); } } } @@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - vec_out = hvx_vec_exp_fp32(in); + vec_out = hvx_vec_exp_fp32_guard(in); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 4cf588a878..953d3e6c16 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); } } @@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard(in); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff 
--git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d3599bc9c1..e02b1d9099 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -401,7 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); + static const float kInf = INFINITY; + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; @@ -409,17 +411,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); + HVX_Vector in = *vec_in1++; + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + v = Q6_Vsf_equals_Vqf32(v); + v = Q6_V_vmux_QVV(pred_inf, inf, v); + *vec_out++ = v; } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; } } @@ -429,8 +438,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index b2ca8e88f4..5f94645cde 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -12,6 +12,15 @@ #define VLEN_FP32 (VLEN / SIZEOF_FP32) #define VLEN_FP16 (VLEN / SIZEOF_FP16) +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; + static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { float f; @@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 } static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - __fp16 d[64]; - } u = { .v = v }; + HVX_VectorAlias u = { .v = v }; const uint32_t n0 = n / 16; const uint32_t n1 = n % 16; int i = 0; for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.d + (16 * i), 16); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); } if (n1) { - htp_dump_fp16_line(pref, u.d + (16 * i), n1); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); } } @@ -411,8 +417,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n HVX_Vector sum = in, sum_t; while (width < total) 
{ - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum width = width << 1; } return sum; @@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { // neg by setting the fp16 sign bit HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); } static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { @@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { #else // neg by setting the fp32 sign bit HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); #endif // __HTP_ARCH__ > 75 } @@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } +static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { + static const float kInf = INFINITY; + static const uint32_t kNanMask = 0x7fffffff; + static const uint32_t kNanMin = 0x7f800000; + + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + + HVX_Vector out = hvx_vec_inverse_fp32(v_sf); + + const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); + const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); + + return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); +} + #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 @@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { + static const float kMaxExp = -88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); + return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); +} + static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; @@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); } } diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 4cb6afe927..2319f7a9e2 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -6895,9 +6895,23 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co cl_context context = backend_ctx->context; if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){ - if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0){ - ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); - return; + if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) { + // For KQ + if (ggml_is_permuted(src0) && ggml_is_permuted(src1) && + nb00 <= nb02 && + nb02 <= nb01 && + nb01 <= 
nb03 && + nb10 <= nb12 && + nb12 <= nb11 && + nb11 <= nb13) { + ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); + return; + } + // For KQV + if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); + return; + } } } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f83dfdaef6..d4f27af8fc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -513,6 +513,7 @@ struct vk_device_struct { vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; + bool support_async; uint32_t subgroup_size; uint32_t shader_core_count; bool uma; @@ -4273,6 +4274,16 @@ static vk_device ggml_vk_get_device(size_t idx) { device->vendor_id = device->properties.vendorID; device->driver_id = driver_props.driverID; + // Implementing the async backend interfaces seems broken on older Intel HW, + // see https://github.com/ggml-org/llama.cpp/issues/17302. + device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || + std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && + getenv("GGML_VK_DISABLE_ASYNC") == nullptr; + + if (!device->support_async) { + GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); + } + const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { @@ -13187,6 +13198,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->perf_logger->print_timings(); } + if (!ctx->device->support_async) { + ggml_vk_synchronize(ctx); + } + return GGML_STATUS_SUCCESS; UNUSED(backend); @@ -13480,6 +13495,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .context = */ ctx, }; + if (!ctx->device->support_async) { + vk_backend->iface.get_tensor_async = nullptr; + } + return vk_backend; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e703181a19..175549a9e3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1593,7 +1593,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); if (!is_lite) { @@ -4581,7 +4582,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - const bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 68f72f72bb..0b41f7ba8e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -4,7 +4,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git 
a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2bb4b12224..90bdb80e5e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7819,6 +7819,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1)); } } @@ -7827,6 +7828,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1)); } } @@ -7837,6 +7839,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 512}) { for (ggml_type type_a : {GGML_TYPE_MXFP4}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1)); } } diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index a284a1f0c5..de587d456d 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -23,7 +23,7 @@ #endif struct quantize_stats_params { - std::string model = DEFAULT_MODEL_PATH; + std::string model = "models/7B/ggml-model-f16.gguf"; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; diff --git a/tools/server/README.md b/tools/server/README.md index 54c1062c9b..3e311a657c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1364,18 +1364,32 @@ llama-server -hf /: *The server must be restarted after adding a new model.* -Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**: +Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Example command: ```sh -llama-3.2-1b-Q4_K_M.gguf -gemma-3-4b-it-Q8_0.gguf -mmproj-gemma-3-4b-it-Q8_0.gguf # must be "mmproj-" + text model filename +llama-server --models-dir ./models_directory ``` -Example: +If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. The directory structure should look like this: ```sh -llama-server --models-dir ./path/to/models +models_directory + │ + │ # single file + ├─ llama-3.2-1b-Q4_K_M.gguf + ├─ Qwen3-8B-Q4_K_M.gguf + │ + │ # multimodal + ├─ gemma-3-4b-it-Q8_0 + │ ├─ gemma-3-4b-it-Q8_0.gguf + │ └─ mmproj-F16.gguf # file name must start with "mmproj" + │ + │ # multi-shard + ├─ gemma-3-4b-it-Q8_0 + │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf + │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf + │ ├─ ... 
+ │ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf ``` You may also specify default arguments that will be passed to every loaded model instance: @@ -1384,6 +1398,8 @@ You may also specify default arguments that will be passed to every loaded model llama-server -ctx 8192 -n 1024 -np 2 ``` +Note: model instances inherit both command line arguments and environment variables from the router server. + ### Routing requests Requests are routed according to the requested model name. @@ -1393,7 +1409,12 @@ For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, et ```json { "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", - ... + "messages": [ + { + "role": "user", + "content": "hello" + } + ] } ``` @@ -1405,15 +1426,92 @@ GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M ### GET `/models`: List available models -TODO +Listing all models in cache. The model metadata will also include a field to indicate the status of the model: + +```json +{ + "data": [{ + "name": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "in_cache": true, + "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf", + "status": { + "value": "loaded" + }, + ... + }] +} +``` + +Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`. + +The `status` object can be: + +```json +"status": { + "value": "unloaded" +} +``` + +```json +"status": { + "value": "loading" +} +``` + +```json +"status": { + "value": "failed" +} +``` + +```json +"status": { + "value": "loaded" +} +``` ### POST `/models/load`: Load a model -TODO + +Load a model + +Payload: + +```json +{ + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" +} +``` + +Response: + +```json +{ + "success": true +} +``` ### POST `/models/unload`: Unload a model -TODO + +Unload a model + +Payload: + +```json +{ + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", +} +``` + +Response: + +```json +{ + "success": true +} +``` ## More examples diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 92b02fbf49..071b5522ea 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() { return std::filesystem::path(buf.data()); } } - return std::filesystem::path(std::string(buf.data(), (size > 0) ? 
## More examples diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 92b02fbf49..071b5522ea 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() { return std::filesystem::path(buf.data()); } } - return std::filesystem::path(std::string(buf.data(), (size > 0) ? size : 0)); + throw std::runtime_error("_NSGetExecutablePath failed after buffer resize"); } #else char path[FILENAME_MAX]; @@ -69,6 +70,64 @@ static std::filesystem::path get_server_exec_path() { #endif } +struct local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +static std::vector<local_model> list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); + } + + std::vector<local_model> models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + // prefer the first shard if present, otherwise use the single model file + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); + } + } + return models; +} + // // server_models // @@ -85,15 +144,17 @@ server_models::server_models( base_env.push_back(std::string(*env)); } // TODO: allow refreshing cached model list + // add cached models auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ /* name */ model.to_string(), /* path */ model.manifest_path, - /* path_mmproj */ "", + /* path_mmproj */ "", // auto-detected when loading /* in_cache */ true, /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), /* meta */ meta }; } + // add local models specified via --models-dir + if (!params.models_dir.empty()) { + auto local_models = list_local_models(params.models_dir); + for (const auto & model : local_models) { + if (mapping.find(model.name) != mapping.end()) { + // already exists in cached models, skip + continue; + } + server_model_meta meta{ + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + } + } void server_models::update_meta(const std::string & name, const server_model_meta & meta) { @@ -207,11 +292,39 @@ std::vector<server_model_meta> server_models::get_all_meta() { return result; } +void server_models::unload_lru() { + if (base_params.max_models <= 0) { + return; // no limit + } + // unload the least recently used (LRU) server instance once the max_models limit is reached
+ std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + { + std::lock_guard lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_active()) { + count_active++; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } + } + } + } + if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) { + SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + unload(lru_model_name); + } +} + void server_models::load(const std::string & name) { - std::lock_guard lk(mutex); - if (mapping.find(name) == mapping.end()) { + if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } + unload_lru(); + + std::lock_guard lk(mutex); auto meta = mapping[name].meta; if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) { @@ -221,9 +334,10 @@ void server_models::load(const std::string & name) { // prepare new instance info instance_t inst; - inst.meta = meta; - inst.meta.port = get_free_port(); - inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta = meta; + inst.meta.port = get_free_port(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.last_used = ggml_time_ms(); if (inst.meta.port <= 0) { throw std::runtime_error("failed to get a port number"); @@ -385,7 +499,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { return true; } -server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) { +server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) { auto meta = get_meta(name); if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); @@ -393,6 +507,10 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co if (ensure_model_loaded(name)) { meta = get_meta(name); // refresh meta } + if (update_last_used) { + std::unique_lock lk(mutex); + mapping[name].meta.last_used = ggml_time_ms(); + } SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); auto proxy = std::make_unique( method, @@ -572,8 +690,11 @@ server_http_proxy::server_http_proxy( // wait for the first chunk (headers) msg_t header; - pipe->read(header, should_stop); - SRV_DBG("%s", "received response headers\n"); - this->status = header.status; - this->headers = header.headers; + if (pipe->read(header, should_stop)) { + SRV_DBG("%s", "received response headers\n"); + this->status = header.status; + this->headers = header.headers; + } else { + SRV_DBG("%s", "no response headers received (request cancelled?)\n"); + } } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f8ae757fa4..3cb3b39fe7 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -58,6 +58,8 @@ struct server_model_meta { bool in_cache = false; // if true, use -hf; use -m otherwise int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; + int64_t last_used = 0; + bool is_active() const { return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; } @@ -81,6 +83,9 @@ private: void update_meta(const std::string & name, const server_model_meta & meta); + // unload least recently used models if the limit is reached + void unload_lru(); + public: 
server_models(const common_params & params, int argc, char ** argv, char ** envp); @@ -109,7 +114,7 @@ public: bool ensure_model_loaded(const std::string & name); // proxy an HTTP request to the model instance - server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name); + server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); // notify the router server that a model instance is ready static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 37ef9d96ea..43d145fb67 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5143,7 +5143,7 @@ public: std::string method = "GET"; std::string name = req.get_param("model"); models->ensure_model_loaded(name); - return models->proxy_request(req, method, name); + return models->proxy_request(req, method, name, false); }; server_http_context::handler_t proxy_post = [this](const server_http_req & req) { @@ -5151,7 +5151,7 @@ public: json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); models->ensure_model_loaded(name); - return models->proxy_request(req, method, name); + return models->proxy_request(req, method, name, true); // update last usage for POST request only }; server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) { @@ -5189,9 +5189,10 @@ public: auto all_models = models->get_all_meta(); for (const auto & model : all_models) { models_json.push_back(json { - {"model", model.name}, - {"name", model.name}, - {"id", model.name}, + {"name", model.name}, + {"id", model.name}, + {"in_cache", model.in_cache}, + {"path", model.path}, // TODO: other fields... 
{"status", { {"value", server_model_status_to_string(model.status)} @@ -5662,8 +5663,7 @@ int main(int argc, char ** argv, char ** envp) { // register API routes server_routes routes(params, ctx_server, ctx_http); - // TODO: improve this by changing arg.cpp - bool is_router_server = params.model.path == DEFAULT_MODEL_PATH; + bool is_router_server = params.model.path.empty(); if (is_router_server) { // setup server instances manager routes.models.reset(new server_models(params, argc, argv, envp)); diff --git a/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte b/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte index 2aebb40d9c..bd40aba568 100644 --- a/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte +++ b/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte @@ -267,7 +267,6 @@ if (activeId) { return options.find((option) => option.id === activeId); } - return options[0]; } diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 3b42fc8c1d..8e0f8064f7 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -22,7 +22,38 @@ target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_TCP_NODELAY=1 ) -if (LLAMA_OPENSSL) +if (LLAMA_BUILD_BORINGSSL) + set(OPENSSL_NO_ASM ON CACHE BOOL "Disable OpenSSL ASM code (BoringSSL)") + set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)") + + set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository") + set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version") + + message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}") + + include(FetchContent) + FetchContent_Declare( + boringssl + GIT_REPOSITORY ${BORINGSSL_GIT} + GIT_TAG ${BORINGSSL_VERSION} + PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake" + ) + + set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(SAVED_BUILD_TESTING ${BUILD_TESTING}) + + set(BUILD_SHARED_LIBS OFF) + set(BUILD_TESTING OFF) + + FetchContent_MakeAvailable(boringssl) + + set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS}) + set(BUILD_TESTING ${SAVED_BUILD_TESTING}) + + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) + target_link_libraries(${TARGET} PUBLIC ssl crypto) + +elseif (LLAMA_OPENSSL) find_package(OpenSSL) if (OpenSSL_FOUND) include(CheckCSourceCompiles) @@ -44,17 +75,20 @@ if (LLAMA_OPENSSL) set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES}) if (OPENSSL_VERSION_SUPPORTED) message(STATUS "OpenSSL found: ${OPENSSL_VERSION}") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto) - if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) - find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) - find_library(SECURITY_FRAMEWORK Security REQUIRED) - target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) - endif() endif() else() message(STATUS "OpenSSL not found, SSL support disabled") endif() endif() +if (CPPHTTPLIB_OPENSSL_SUPPORT) + target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp + if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) + find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) + 
find_library(SECURITY_FRAMEWORK Security REQUIRED) + target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) + endif() +endif() diff --git a/vendor/cpp-httplib/patch-boringssl.cmake b/vendor/cpp-httplib/patch-boringssl.cmake new file mode 100644 index 0000000000..2914e1dddb --- /dev/null +++ b/vendor/cpp-httplib/patch-boringssl.cmake @@ -0,0 +1,6 @@ +# Remove bssl +file(READ "CMakeLists.txt" content) +string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}") +string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}") +string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}") +file(WRITE "CMakeLists.txt" "${content}")
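As a usage note (a sketch, not part of this diff): with the options introduced above, cpp-httplib's HTTPS support can be satisfied either by the vendored BoringSSL fetched at configure time or by a system OpenSSL. Assuming a standard CMake and Git toolchain, the two configurations might be exercised as follows; either path defines `CPPHTTPLIB_OPENSSL_SUPPORT` on the target, which is what `server.cpp` checks for:

```sh
# assumption: run from the repository root with network access for FetchContent
# vendored BoringSSL (static, fetched at configure time)
cmake -B build-boringssl -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build-boringssl --config Release

# system OpenSSL (requires the OpenSSL development headers, e.g. libssl-dev)
cmake -B build-openssl -DLLAMA_OPENSSL=ON
cmake --build build-openssl --config Release
```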