Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2

This commit is contained in:
Aleksander Grygier 2025-11-22 23:35:05 +01:00
commit 6282537a8b
31 changed files with 742 additions and 229 deletions

View File

@ -69,13 +69,6 @@ jobs:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
@ -83,6 +76,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
@ -110,13 +105,6 @@ jobs:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
@ -126,6 +114,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@ -151,13 +141,6 @@ jobs:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Dawn Dependency
id: dawn-depends
run: |
@ -217,7 +200,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
libjpeg-dev build-essential libcurl4-openssl-dev \
libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Python Dependencies
@ -238,6 +221,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@ -294,13 +279,15 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@ -311,6 +298,8 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@ -335,7 +324,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
@ -343,6 +332,8 @@ jobs:
mkdir build
cd build
cmake .. \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build . --config Release -j $(nproc)
@ -373,12 +364,14 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@ -405,12 +398,14 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev
sudo apt-get install -y glslc libvulkan-dev libssl-dev
- name: Configure
id: cmake_configure
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -440,7 +435,7 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@ -466,6 +461,8 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@ -497,7 +494,7 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@ -537,7 +534,10 @@ jobs:
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build -DGGML_WEBGPU=ON
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
- name: Test
@ -560,7 +560,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@ -572,6 +572,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@ -590,7 +592,7 @@ jobs:
id: depends
run: |
apt-get update
apt-get install -y build-essential git cmake libcurl4-openssl-dev
apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@ -602,6 +604,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@ -626,7 +630,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@ -648,6 +652,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@ -674,7 +680,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@ -696,6 +702,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@ -722,12 +730,6 @@ jobs:
key: macOS-latest-cmake-ios
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
@ -759,12 +761,6 @@ jobs:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
@ -790,12 +786,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
@ -838,12 +828,6 @@ jobs:
name: llama-xcframework
path: build-apple/llama.xcframework/
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build llama.cpp with CMake
id: cmake_build
run: |
@ -995,21 +979,12 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
- name: Add libopenblas.dll
id: add_libopenblas_dll
@ -1053,7 +1028,7 @@ jobs:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@ -1064,10 +1039,12 @@ jobs:
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake --build build
@ -1101,25 +1078,20 @@ jobs:
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
-DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@ -1151,7 +1123,7 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
@ -1208,14 +1180,8 @@ jobs:
key: ${{ github.job }}
evict-old-files: 1d
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@ -1224,11 +1190,12 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_CURL=OFF `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ios-xcode-build:

View File

@ -56,7 +56,7 @@ jobs:
curl \
wget \
language-pack-en \
libcurl4-openssl-dev
libssl-dev
- name: Clone
id: checkout
@ -242,7 +242,7 @@ jobs:
curl \
wget \
language-pack-en \
libcurl4-openssl-dev
libssl-dev
- name: Clone
id: checkout
@ -283,6 +283,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@ -295,6 +297,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@ -306,6 +310,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

View File

@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
- [unslothai/unsloth](https://github.com/unslothai/unsloth) 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)
</details>

View File

@ -212,7 +212,6 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
@ -257,8 +256,6 @@ static handle_model_result common_params_handle_model(
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.path.empty()) {
model.path = model_path_default;
}
}
@ -405,7 +402,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@ -415,12 +412,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@ -2072,11 +2075,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
? std::string("model path from which to load base model")
: string_format(
"model path (default: `models/$filename` with filename from `--hf-file` "
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
),
? "model path from which to load base model"
: "model path to load",
[](common_params & params, const std::string & value) {
params.model.path = value;
}
@ -2474,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--models-dir"}, "PATH",
"directory containing models for the router server (default: disabled)",
[](common_params & params, const std::string & value) {
params.models_dir = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--max-models"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models),
[](common_params & params, int value) {
params.max_models = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS"));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",

View File

@ -911,7 +911,7 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
std::vector<common_file_info> fs_list_files(const std::string & path) {
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
std::vector<common_file_info> files;
if (path.empty()) return files;
@ -926,14 +926,22 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
const auto & p = entry.path();
if (std::filesystem::is_regular_file(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.path = p.string();
info.name = p.filename().string();
info.is_dir = false;
try {
info.size = static_cast<size_t>(std::filesystem::file_size(p));
} catch (const std::filesystem::filesystem_error &) {
info.size = 0;
}
files.push_back(std::move(info));
} else if (include_directories && std::filesystem::is_directory(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.size = 0; // Directories have no size
info.is_dir = true;
files.push_back(std::move(info));
}
} catch (const std::filesystem::filesystem_error &) {
// skip entries we cannot inspect

View File

@ -26,8 +26,6 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
@ -460,6 +458,10 @@ struct common_params {
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int max_models = 4; // maximum number of models to load simultaneously
bool log_json = false;
std::string slot_save_path;
@ -623,8 +625,9 @@ struct common_file_info {
std::string path;
std::string name;
size_t size = 0; // in bytes
bool is_dir = false;
};
std::vector<common_file_info> fs_list_files(const std::string & path);
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
//
// Model utils

View File

@ -1047,7 +1047,7 @@ std::string common_docker_resolve_model(const std::string &) {
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list_files(cache_dir);
const std::vector<common_file_info> files = fs_list(cache_dir, false);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;

View File

@ -2246,8 +2246,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
bool & use_cann_graph,
bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (use_cann_graph && cann_graph_update_required) {
if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
}
#endif // USE_ACL_GRAPH
@ -2271,12 +2270,14 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
}
#ifdef USE_ACL_GRAPH
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
}
if (use_cann_graph) {
// Execute graph
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (cann_graph_update_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
}
// Execute CANN graph
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
}
#endif // USE_ACL_GRAPH

View File

@ -224,6 +224,10 @@ static const char * cu_get_error_str(CUresult err) {
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#if defined(GGML_USE_HIP) && defined(RDNA4)
#define AMD_WMMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(RDNA4)
// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
@ -283,6 +287,10 @@ static bool amd_mfma_available(const int cc) {
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
}
static bool amd_wmma_available(const int cc) {
return GGML_CUDA_CC_IS_RDNA4(cc);
}
static bool volta_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
}

View File

@ -39,6 +39,15 @@ template<typename dst_t, typename src_t>
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
return __bfloat162float(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
return __float22half2_rn(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
// bypass compile error on cuda 12.0.1
#ifdef GGML_USE_HIP
return __float22bfloat162_rn(x);
#else
return {x.x, x.y};
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
return int32_t(x);
} else {

View File

@ -74,6 +74,33 @@ namespace ggml_cuda_mma {
static constexpr int J = J_;
#if defined(GGML_USE_HIP)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
return 8 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
static constexpr int ne = I * J / 64;
T x[ne] = {0};
@ -119,6 +146,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@ -236,6 +264,32 @@ namespace ggml_cuda_mma {
return -1;
}
}
#elif defined(AMD_WMMA_AVAILABLE)
static constexpr int ne = I * J / 32;
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 8) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
return 4 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
@ -285,6 +339,34 @@ namespace ggml_cuda_mma {
struct tile<I_, J_, nv_bfloat162> {
static constexpr int I = I_;
static constexpr int J = J_;
#if defined(AMD_WMMA_AVAILABLE)
static constexpr int ne = I * J / 32;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 8) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
return 4 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
@ -320,6 +402,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // defined(AMD_WMMA_AVAILABLE)
};
template <int I, int J>
@ -353,6 +436,8 @@ namespace ggml_cuda_mma {
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}
#elif defined(AMD_WMMA_AVAILABLE)
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
@ -639,12 +724,34 @@ namespace ggml_cuda_mma {
: "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#elif defined(AMD_WMMA_AVAILABLE)
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
static __device__ __forceinline__ void mma(
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // AMPERE_MMA_AVAILABLE
}
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
#if defined(AMD_MFMA_AVAILABLE)

View File

@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
return false;
}
} else {
if (src1_ncols > 16) {
if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
return false;
}
}
@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
case GGML_TYPE_F32:
return ampere_mma_available(cc);
case GGML_TYPE_F16:
return volta_mma_available(cc) || turing_mma_available(cc);
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
case GGML_TYPE_BF16:
return ampere_mma_available(cc);
return ampere_mma_available(cc) || amd_wmma_available(cc);
default:
return false;
}

View File

@ -2,6 +2,7 @@
#include "mma.cuh"
#include "common.cuh"
#include "convert.cuh"
using namespace ggml_cuda_mma;
@ -27,20 +28,35 @@ static __global__ void mul_mat_f(
const int stride_col_id, const int stride_row_id,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE)
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool a_supported = tile_A::supported();
constexpr bool b_supported = tile_B::supported();
constexpr bool c_supported = tile_C::supported();
constexpr bool supported = a_supported && b_supported && c_supported;
#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
if (!I_16_supported && !I_32_supported) {
NO_DEVICE_CODE;
return;
}
constexpr bool supported = I_16_supported || I_32_supported;
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
#endif // defined(AMD_WMMA_AVAILABLE)
if constexpr (!supported) {
NO_DEVICE_CODE;
return;
}
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
@ -161,11 +177,11 @@ static __global__ void mul_mat_f(
if constexpr (!has_ids) {
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
} else {
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
}
}
} else {
@ -239,7 +255,7 @@ static __global__ void mul_mat_f(
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
}
//This kernel is for larger batch sizes of mul_mat_id
@ -253,20 +269,35 @@ static __global__ void mul_mat_f_ids(
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 sis1_fd, const uint3 nch_fd) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE)
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool a_supported = tile_A::supported();
constexpr bool b_supported = tile_B::supported();
constexpr bool c_supported = tile_C::supported();
constexpr bool supported = a_supported && b_supported && c_supported;
#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
constexpr bool supported = I_16_supported || I_32_supported;
if (!I_16_supported && !I_32_supported) {
NO_DEVICE_CODE;
return;
}
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster.
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
#endif // defined(AMD_WMMA_AVAILABLE)
if constexpr (!supported) {
NO_DEVICE_CODE;
return;
}
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
@ -408,7 +439,7 @@ static __global__ void mul_mat_f_ids(
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const float2 tmp = vals_buf[curr_buf][j0];
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
}
if (itB + 1 < ntB) {
@ -492,7 +523,7 @@ static __global__ void mul_mat_f_ids(
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd);
NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
}
template<typename T, int cols_per_block, int nwarps>
@ -554,7 +585,8 @@ void mul_mat_f_cuda(
cudaStream_t stream, const mmf_ids_data * ids_data) {
typedef tile<16, 8, T> tile_A_16;
typedef tile<32, 8, T> tile_A_32;
typedef tile< 8, 8, T> tile_B;
typedef tile<16, 8, T> tile_B_16;
typedef tile< 8, 8, T> tile_B_8;
GGML_ASSERT(ncols_x % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
@ -581,7 +613,8 @@ void mul_mat_f_cuda(
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_cols_per_block_pad = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I;
const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;

View File

@ -106,33 +106,32 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
bool src1_valid = src1->ne[0];
const bool src1_valid = src1->ne[0];
const int nc = (src1_valid) ? ne00 : ne00 / 2;
if (!src1_valid) {
data_src1 = data_src0;
src1_row_size = src0_row_size;
const int32_t swapped = op_params[1];
data_src1 = data_src0;
src1_row_size = src0_row_size;
const size_t nc_in_bytes = nc * SIZEOF_FP32;
data_src0 += swapped ? nc_in_bytes : 0;
data_src1 += swapped ? 0 : nc_in_bytes;
}
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
const int32_t swapped = op_params[1];
const int nc = (src1_valid) ? ne0 : ne0 / 2;
const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
@ -142,12 +141,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
}
if (!src1_valid) {
src0 += swapped ? nc : 0;
src1 += swapped ? 0 : nc;
}
if (1 == opt_path) {
if (opt_path) {
hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
(uint8_t *) dst, nc);
@ -218,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
const float alpha = ((const float *) (op_params))[2];
const float limit = ((const float *) (op_params))[3];
const int nc = (src1_valid) ? ne0 : ne0 / 2;
const int nc = (src1_valid) ? ne00 : ne00 / 2;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));

View File

@ -16,6 +16,19 @@
#include "hvx-utils.h"
#include "ops-utils.h"
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
static const float kInf = INFINITY;
static const float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
HVX_Vector out = hvx_vec_exp_fp32(in_vec);
return Q6_V_vmux_QVV(pred0, inf, out);
}
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in);
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
} else {
*p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
}
}
} else {
@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
} else {
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
}
}
}
@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
vec_out = hvx_vec_exp_fp32(neg_vec_in);
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
} else {
vec_out = hvx_vec_exp_fp32(in);
vec_out = hvx_vec_exp_fp32_guard(in);
}
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);

View File

@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
*p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++);
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
}
}
@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_inverse_fp32(in);
HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
}

View File

@ -401,7 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
static const float kInf = INFINITY;
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
@ -409,17 +411,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
HVX_Vector in = *vec_in1++;
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
v = Q6_Vsf_equals_Vqf32(v);
v = Q6_V_vmux_QVV(pred_inf, inf, v);
*vec_out++ = v;
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
out = Q6_Vsf_equals_Vqf32(out);
out = Q6_V_vmux_QVV(pred_inf, inf, out);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = out;
}
}
@ -429,8 +438,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
out = Q6_Vsf_equals_Vqf32(out);
out = Q6_V_vmux_QVV(pred_inf, inf, out);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
}
}

View File

@ -12,6 +12,15 @@
#define VLEN_FP32 (VLEN / SIZEOF_FP32)
#define VLEN_FP16 (VLEN / SIZEOF_FP16)
typedef union {
HVX_Vector v;
uint8_t b[VLEN];
uint16_t h[VLEN_FP16];
uint32_t w[VLEN_FP32];
__fp16 fp16[VLEN_FP16];
float fp32[VLEN_FP32];
} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
union {
float f;
@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
}
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
union {
HVX_Vector v;
__fp16 d[64];
} u = { .v = v };
HVX_VectorAlias u = { .v = v };
const uint32_t n0 = n / 16;
const uint32_t n1 = n % 16;
int i = 0;
for (; i < n0; i++) {
htp_dump_fp16_line(pref, u.d + (16 * i), 16);
htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16);
}
if (n1) {
htp_dump_fp16_line(pref, u.d + (16 * i), n1);
htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1);
}
}
@ -411,8 +417,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum
width = width << 1;
}
return sum;
@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
// neg by setting the fp16 sign bit
HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
return Q6_V_vor_VV(v, mask);
return Q6_V_vxor_VV(v, mask);
}
static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
#else
// neg by setting the fp32 sign bit
HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
return Q6_V_vor_VV(v, mask);
return Q6_V_vxor_VV(v, mask);
#endif // __HTP_ARCH__ > 75
}
@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
return Q6_Vsf_equals_Vqf32(r_qf);
}
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
static const float kInf = INFINITY;
static const uint32_t kNanMask = 0x7fffffff;
static const uint32_t kNanMin = 0x7f800000;
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
}
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
return Q6_Vsf_equals_Vqf32(temp);
}
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
static const float kMaxExp = -88.02f; // log(INF)
const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
}
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;
@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]);
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
}
}

View File

@ -6895,9 +6895,23 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
cl_context context = backend_ctx->context;
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0){
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
return;
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
// For KQ
if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
nb00 <= nb02 &&
nb02 <= nb01 &&
nb01 <= nb03 &&
nb10 <= nb12 &&
nb12 <= nb11 &&
nb11 <= nb13) {
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
return;
}
// For KQV
if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
return;
}
}
}

View File

@ -513,6 +513,7 @@ struct vk_device_struct {
vk_queue compute_queue;
vk_queue transfer_queue;
bool single_queue;
bool support_async;
uint32_t subgroup_size;
uint32_t shader_core_count;
bool uma;
@ -4273,6 +4274,16 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->vendor_id = device->properties.vendorID;
device->driver_id = driver_props.driverID;
// Implementing the async backend interfaces seems broken on older Intel HW,
// see https://github.com/ggml-org/llama.cpp/issues/17302.
device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) &&
getenv("GGML_VK_DISABLE_ASYNC") == nullptr;
if (!device->support_async) {
GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n");
}
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
@ -13187,6 +13198,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->device->perf_logger->print_timings();
}
if (!ctx->device->support_async) {
ggml_vk_synchronize(ctx);
}
return GGML_STATUS_SUCCESS;
UNUSED(backend);
@ -13480,6 +13495,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
/* .context = */ ctx,
};
if (!ctx->device->support_async) {
vk_backend->iface.get_tensor_async = nullptr;
}
return vk_backend;
}

View File

@ -1593,7 +1593,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_DEEPSEEK2:
{
bool is_lite = (hparams.n_layer == 27);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
if (!is_lite) {
@ -4581,7 +4582,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break;
case LLM_ARCH_DEEPSEEK2:
{
const bool is_lite = (hparams.n_layer == 27);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

View File

@ -4,7 +4,8 @@
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

View File

@ -7819,6 +7819,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048));
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));
}
}
@ -7827,6 +7828,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048));
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1));
}
}
@ -7837,6 +7839,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
for (int bs : {1, 4, 8, 512}) {
for (ggml_type type_a : {GGML_TYPE_MXFP4}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880));
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1));
}
}

View File

@ -23,7 +23,7 @@
#endif
struct quantize_stats_params {
std::string model = DEFAULT_MODEL_PATH;
std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;

View File

@ -1364,18 +1364,32 @@ llama-server -hf <user>/<model>:<tag>
*The server must be restarted after adding a new model.*
Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**:
Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Example command:
```sh
llama-3.2-1b-Q4_K_M.gguf
gemma-3-4b-it-Q8_0.gguf
mmproj-gemma-3-4b-it-Q8_0.gguf # must be "mmproj-" + text model filename
llama-server --models-dir ./models_directory
```
Example:
If a model consists of multiple GGUF files (multimodal or multi-shard), the files should be placed in a subdirectory. The directory structure should look like this:
```sh
llama-server --models-dir ./path/to/models
models_directory
│ # single file
├─ llama-3.2-1b-Q4_K_M.gguf
├─ Qwen3-8B-Q4_K_M.gguf
│ # multimodal
├─ gemma-3-4b-it-Q8_0
│ ├─ gemma-3-4b-it-Q8_0.gguf
│ └─ mmproj-F16.gguf # file name must start with "mmproj"
│ # multi-shard
├─ Kimi-K2-Thinking-UD-IQ1_S
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf
│ ├─ ...
│ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf
```
You may also specify default arguments that will be passed to every loaded model instance:
@ -1384,6 +1398,8 @@ You may also specify default arguments that will be passed to every loaded model
llama-server -ctx 8192 -n 1024 -np 2
```
Note: model instances inherit both command line arguments and environment variables from the router server.
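For example (a hedged sketch; it assumes your build exposes the `LLAMA_ARG_N_GPU_LAYERS` environment variable), a variable exported for the router is also seen by every model instance it spawns:

```sh
# the router and every spawned model instance offload layers to the GPU
LLAMA_ARG_N_GPU_LAYERS=99 llama-server --models-dir ./models_directory
```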
### Routing requests
Requests are routed according to the requested model name.
@ -1393,7 +1409,12 @@ For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, et
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
...
"messages": [
{
"role": "user",
"content": "hello"
}
]
}
```
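For illustration, assuming the router listens on the default `http://localhost:8080`, such a request can be sent with `curl`; the router proxies it to the matching model instance, loading the model on demand if it is not running yet:

```sh
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
      "messages": [{"role": "user", "content": "hello"}]
    }'
```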
@ -1405,15 +1426,92 @@ GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
### GET `/models`: List available models
TODO
Lists all available models (cached and local). Each model's metadata also includes a field indicating the model's status:
```json
{
"data": [{
"name": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"in_cache": true,
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
"status": {
"value": "loaded"
},
...
}]
}
```
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
The `status.value` field can be one of `unloaded`, `loading`, `failed`, or `loaded`.
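For example, assuming the default `http://localhost:8080`, the list can be retrieved with:

```sh
curl http://localhost:8080/models
```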
### POST `/models/load`: Load a model
TODO
Load the specified model, starting a dedicated server instance for it.
Payload:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
}
```
Response:
```json
{
"success": true
}
```
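A minimal example call, assuming the default `http://localhost:8080`:

```sh
curl http://localhost:8080/models/load \
    -H "Content-Type: application/json" \
    -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"}'
```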
### POST `/models/unload`: Unload a model
TODO
Unload the specified model, stopping its server instance.
Payload:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
}
```
Response:
```json
{
"success": true
}
```
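The call mirrors `/models/load`, again assuming the default `http://localhost:8080`:

```sh
curl http://localhost:8080/models/unload \
    -H "Content-Type: application/json" \
    -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"}'
```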
## More examples

View File

@ -12,6 +12,7 @@
#include <cstring>
#include <atomic>
#include <chrono>
#include <unordered_set>
#ifdef _WIN32
#include <winsock2.h>
@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() {
return std::filesystem::path(buf.data());
}
}
return std::filesystem::path(std::string(buf.data(), (size > 0) ? size : 0));
throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
}
#else
char path[FILENAME_MAX];
@ -69,6 +70,64 @@ static std::filesystem::path get_server_exec_path() {
#endif
}
struct local_model {
std::string name;
std::string path;
std::string path_mmproj;
};
static std::vector<local_model> list_local_models(const std::string & dir) {
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str()));
}
std::vector<local_model> models;
auto scan_subdir = [&models](const std::string & subdir_path, const std::string name) {
auto files = fs_list(subdir_path, false);
common_file_info model_file;
common_file_info first_shard_file;
common_file_info mmproj_file;
for (const auto & file : files) {
if (string_ends_with(file.name, ".gguf")) {
if (file.name.find("mmproj") != std::string::npos) {
mmproj_file = file;
} else if (file.name.find("-00001-of-") != std::string::npos) {
first_shard_file = file;
} else {
model_file = file;
}
}
}
// single file model
local_model model{
/* name */ name,
/* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
/* path_mmproj */ mmproj_file.path // can be empty
};
if (!model.path.empty()) {
models.push_back(model);
}
};
auto files = fs_list(dir, true);
for (const auto & file : files) {
if (file.is_dir) {
scan_subdir(file.path, file.name);
} else if (string_ends_with(file.name, ".gguf")) {
// single file model
std::string name = file.name;
string_replace_all(name, ".gguf", "");
local_model model{
/* name */ name,
/* path */ file.path,
/* path_mmproj */ ""
};
models.push_back(model);
}
}
return models;
}
//
// server_models
//
@ -85,15 +144,17 @@ server_models::server_models(
base_env.push_back(std::string(*env));
}
// TODO: allow refreshing cached model list
// add cached models
auto cached_models = common_list_cached_models();
for (const auto & model : cached_models) {
server_model_meta meta{
/* name */ model.to_string(),
/* path */ model.manifest_path,
/* path_mmproj */ "",
/* path_mmproj */ "", // auto-detected when loading
/* in_cache */ true,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
@ -101,6 +162,30 @@ server_models::server_models(
/* meta */ meta
};
}
// add local models specified via --models-dir
if (!params.models_dir.empty()) {
auto local_models = list_local_models(params.models_dir);
for (const auto & model : local_models) {
if (mapping.find(model.name) != mapping.end()) {
// already exists in cached models, skip
continue;
}
server_model_meta meta{
/* name */ model.name,
/* path */ model.path,
/* path_mmproj */ model.path_mmproj,
/* in_cache */ false,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ meta
};
}
}
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
@ -207,11 +292,39 @@ std::vector<server_model_meta> server_models::get_all_meta() {
return result;
}
void server_models::unload_lru() {
if (base_params.max_models <= 0) {
return; // no limit
}
// remove one of the servers if we passed the max_models (least recently used - LRU)
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
{
std::lock_guard<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_active()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
}
}
}
if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) {
SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
}
}
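As an illustration of the selection done by unload_lru() above, this self-contained sketch picks the active model with the smallest last_used timestamp; the model names and millisecond timestamps are hypothetical, and INT64_MAX stands in for the "now" sentinel used by the real code.

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

int main() {
    // Hypothetical active models and their last_used timestamps (ms).
    const std::map<std::string, int64_t> last_used = {
        { "modelA", 1000 },  // least recently used -> eviction candidate
        { "modelB", 2500 },
        { "modelC", 4000 },
    };
    std::string lru_name;
    int64_t lru_time = INT64_MAX;
    for (const auto & [name, t] : last_used) {
        if (t < lru_time) {
            lru_time = t;
            lru_name = name;
        }
    }
    std::printf("evict candidate: %s\n", lru_name.c_str());  // prints modelA
    return 0;
}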
void server_models::load(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
if (mapping.find(name) == mapping.end()) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
unload_lru();
std::lock_guard<std::mutex> lk(mutex);
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
@ -221,9 +334,10 @@ void server_models::load(const std::string & name) {
// prepare new instance info
instance_t inst;
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.last_used = ggml_time_ms();
if (inst.meta.port <= 0) {
throw std::runtime_error("failed to get a port number");
@ -385,7 +499,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
return true;
}
server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) {
server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) {
auto meta = get_meta(name);
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
@ -393,6 +507,10 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
if (ensure_model_loaded(name)) {
meta = get_meta(name); // refresh meta
}
if (update_last_used) {
std::unique_lock<std::mutex> lk(mutex);
mapping[name].meta.last_used = ggml_time_ms();
}
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
auto proxy = std::make_unique<server_http_proxy>(
method,
@ -572,8 +690,11 @@ server_http_proxy::server_http_proxy(
// wait for the first chunk (headers)
msg_t header;
pipe->read(header, should_stop);
SRV_DBG("%s", "received response headers\n");
this->status = header.status;
this->headers = header.headers;
if (pipe->read(header, should_stop)) {
SRV_DBG("%s", "received response headers\n");
this->status = header.status;
this->headers = header.headers;
} else {
SRV_DBG("%s", "no response headers received (request cancelled?)\n");
}
}

View File

@ -58,6 +58,8 @@ struct server_model_meta {
bool in_cache = false; // if true, use -hf; use -m otherwise
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0;
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
}
@ -81,6 +83,9 @@ private:
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
void unload_lru();
public:
server_models(const common_params & params, int argc, char ** argv, char ** envp);
@ -109,7 +114,7 @@ public:
bool ensure_model_loaded(const std::string & name);
// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name);
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
// notify the router server that a model instance is ready
static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);

View File

@ -5143,7 +5143,7 @@ public:
std::string method = "GET";
std::string name = req.get_param("model");
models->ensure_model_loaded(name);
return models->proxy_request(req, method, name);
return models->proxy_request(req, method, name, false);
};
server_http_context::handler_t proxy_post = [this](const server_http_req & req) {
@ -5151,7 +5151,7 @@ public:
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
models->ensure_model_loaded(name);
return models->proxy_request(req, method, name);
return models->proxy_request(req, method, name, true); // only POST requests update last_used
};
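For illustration, a minimal sketch of how the routing key is pulled out of a proxied POST body, assuming the json type used by the handler above is nlohmann::json; the request body is hypothetical and .value() plays the role of the json_value() helper.

#include <cstdio>
#include <string>
#include <nlohmann/json.hpp>  // assumption: the handlers above use nlohmann::json

using json = nlohmann::json;

int main() {
    // Hypothetical body of a proxied POST request (e.g. a chat completion call).
    const json body = json::parse(R"({"model": "qwen2", "messages": []})");
    // An empty name means the request cannot be routed to a specific instance.
    const std::string name = body.value("model", std::string());
    std::printf("proxy target model: %s\n", name.c_str());  // -> qwen2
    return 0;
}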
server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) {
@ -5189,9 +5189,10 @@ public:
auto all_models = models->get_all_meta();
for (const auto & model : all_models) {
models_json.push_back(json {
{"model", model.name},
{"name", model.name},
{"id", model.name},
{"name", model.name},
{"id", model.name},
{"in_cache", model.in_cache},
{"path", model.path},
// TODO: other fields...
{"status", {
{"value", server_model_status_to_string(model.status)}
@ -5662,8 +5663,7 @@ int main(int argc, char ** argv, char ** envp) {
// register API routes
server_routes routes(params, ctx_server, ctx_http);
// TODO: improve this by changing arg.cpp
bool is_router_server = params.model.path == DEFAULT_MODEL_PATH;
bool is_router_server = params.model.path.empty();
if (is_router_server) {
// setup server instances manager
routes.models.reset(new server_models(params, argc, argv, envp));

View File

@ -267,7 +267,6 @@
if (activeId) {
return options.find((option) => option.id === activeId);
}
return options[0];
}
</script>

View File

@ -22,7 +22,38 @@ target_compile_definitions(${TARGET} PRIVATE
CPPHTTPLIB_TCP_NODELAY=1
)
if (LLAMA_OPENSSL)
if (LLAMA_BUILD_BORINGSSL)
set(OPENSSL_NO_ASM ON CACHE BOOL "Disable OpenSSL ASM code (BoringSSL)")
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
include(FetchContent)
FetchContent_Declare(
boringssl
GIT_REPOSITORY ${BORINGSSL_GIT}
GIT_TAG ${BORINGSSL_VERSION}
PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake"
)
set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
set(SAVED_BUILD_TESTING ${BUILD_TESTING})
set(BUILD_SHARED_LIBS OFF)
set(BUILD_TESTING OFF)
FetchContent_MakeAvailable(boringssl)
set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS})
set(BUILD_TESTING ${SAVED_BUILD_TESTING})
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
target_link_libraries(${TARGET} PUBLIC ssl crypto)
elseif (LLAMA_OPENSSL)
find_package(OpenSSL)
if (OpenSSL_FOUND)
include(CheckCSourceCompiles)
@ -44,17 +75,20 @@ if (LLAMA_OPENSSL)
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
if (OPENSSL_VERSION_SUPPORTED)
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
find_library(SECURITY_FRAMEWORK Security REQUIRED)
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
endif()
endif()
else()
message(STATUS "OpenSSL not found, SSL support disabled")
endif()
endif()
if (CPPHTTPLIB_OPENSSL_SUPPORT)
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
find_library(SECURITY_FRAMEWORK Security REQUIRED)
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
endif()
endif()

View File

@ -0,0 +1,6 @@
# Comment out BoringSSL's bssl command-line tool so that only the ssl and crypto libraries are built
file(READ "CMakeLists.txt" content)
string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}")
string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}")
string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}")
file(WRITE "CMakeLists.txt" "${content}")