diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5215cc3572..eee42759fc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -69,13 +69,6 @@ jobs: key: macOS-latest-cmake-arm64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -83,6 +76,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=OFF \ -DGGML_METAL_SHADER_DEBUG=ON \ @@ -110,13 +105,6 @@ jobs: key: macOS-latest-cmake-x64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -126,6 +114,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON \ -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 @@ -151,13 +141,6 @@ jobs: key: macOS-latest-cmake-arm64-webgpu evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Dawn Dependency id: dawn-depends run: | @@ -217,7 +200,7 @@ jobs: sudo apt-get update sudo apt-get install -y --no-install-recommends \ python3 python3-pip python3-dev \ - libjpeg-dev build-essential libcurl4-openssl-dev \ + libjpeg-dev build-essential libssl-dev \ git-lfs - name: Python Dependencies @@ -238,6 +221,8 @@ jobs: id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -294,13 +279,15 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -311,6 +298,8 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ @@ -335,7 +324,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build @@ -343,6 +332,8 @@ jobs: mkdir build cd build cmake .. \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_LLGUIDANCE=ON cmake --build . 
--config Release -j $(nproc) @@ -373,12 +364,14 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -405,12 +398,14 @@ jobs: - name: Dependencies id: depends run: | - sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev + sudo apt-get install -y glslc libvulkan-dev libssl-dev - name: Configure id: cmake_configure run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DGGML_BACKEND_DL=ON \ -DGGML_CPU_ALL_VARIANTS=ON \ @@ -440,7 +435,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -466,6 +461,8 @@ jobs: run: | source ./vulkan_sdk/setup-env.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_VULKAN=ON cmake --build build --config Release -j $(nproc) @@ -497,7 +494,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -537,7 +534,10 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test @@ -560,7 +560,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -572,6 +572,8 @@ jobs: id: cmake_build run: | cmake -B build -S . \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON @@ -590,7 +592,7 @@ jobs: id: depends run: | apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev + apt-get install -y build-essential git cmake libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -602,6 +604,8 @@ jobs: id: cmake_build run: | cmake -B build -S . 
\ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_MUSA=ON cmake --build build --config Release -j $(nproc) @@ -626,7 +630,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -648,6 +652,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx @@ -674,7 +680,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -696,6 +702,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx \ @@ -722,12 +730,6 @@ jobs: key: macOS-latest-cmake-ios evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -759,12 +761,6 @@ jobs: key: macOS-latest-cmake-tvos evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -790,12 +786,6 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -838,12 +828,6 @@ jobs: name: llama-xcframework path: build-apple/llama.xcframework/ - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build llama.cpp with CMake id: cmake_build run: | @@ -995,21 +979,12 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - name: Add libopenblas.dll id: add_libopenblas_dll @@ -1053,7 +1028,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev + apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -1064,10 +1039,12 @@ jobs: - name: Build with CMake run: | cmake -S . 
-B build -G Ninja \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES=89-real \ -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_NATIVE=OFF \ -DGGML_CUDA=ON cmake --build build @@ -1101,25 +1078,20 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . -B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ + -DLLAMA_CURL=OFF ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ -DGGML_CPU_ALL_VARIANTS=ON ^ -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" + -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release @@ -1151,7 +1123,7 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - name: Build id: cmake_build @@ -1208,14 +1180,8 @@ jobs: key: ${{ github.job }} evict-old-files: 1d - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -1224,11 +1190,12 @@ jobs: -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" ` -DCMAKE_BUILD_TYPE=Release ` + -DLLAMA_CURL=OFF ` + -DLLAMA_BUILD_BORINGSSL=ON ` -DROCM_DIR="${env:HIP_PATH}" ` -DGGML_HIP=ON ` -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} ios-xcode-build: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index ebcd6424bc..be4e23bc81 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -56,7 +56,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -242,7 +242,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -283,6 +283,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ @@ -295,6 +297,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; @@ -306,6 +310,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) 
--target llama-server diff --git a/README.md b/README.md index 3d94b30cda..2e44ae7d0c 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) +- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0) diff --git a/common/arg.cpp b/common/arg.cpp index 430ab45dfe..eab26b67f2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -212,7 +212,6 @@ struct handle_model_result { static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default, bool offline) { handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file @@ -257,8 +256,6 @@ static handle_model_result common_params_handle_model( model.path = fs_get_cache_file(string_split(f, '/').back()); } - } else if (model.path.empty()) { - model.path = model_path_default; } } @@ -405,7 +402,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download { - auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline); + auto res = common_params_handle_model(params.model, params.hf_token, params.offline); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -415,12 +412,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // only download mmproj if the current example is using it for (auto & ex : mmproj_examples) { if (ctx_arg.ex == ex) { - common_params_handle_model(params.mmproj, params.hf_token, "", params.offline); + common_params_handle_model(params.mmproj, params.hf_token, params.offline); break; } } - common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline); - common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline); + common_params_handle_model(params.speculative.model, params.hf_token, params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); + } + + // model is required (except for server) + // TODO @ngxson : maybe show a list of available models in CLI in this case + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) { + throw std::invalid_argument("error: --model is required\n"); } if (params.escape) { @@ -2072,11 +2075,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA - ? std::string("model path from which to load base model") - : string_format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), + ? 
"model path from which to load base model" + : "model path to load", [](common_params & params, const std::string & value) { params.model.path = value; } @@ -2474,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--models-dir"}, "PATH", + "directory containing models for the router server (default: disabled)", + [](common_params & params, const std::string & value) { + params.models_dir = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); + add_opt(common_arg( + {"--max-models"}, "N", + string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models), + [](common_params & params, int value) { + params.max_models = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS")); add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", diff --git a/common/common.cpp b/common/common.cpp index f3cc55247e..be31c66de1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -911,7 +911,7 @@ std::string fs_get_cache_file(const std::string & filename) { return cache_directory + filename; } -std::vector fs_list_files(const std::string & path) { +std::vector fs_list(const std::string & path, bool include_directories) { std::vector files; if (path.empty()) return files; @@ -926,14 +926,22 @@ std::vector fs_list_files(const std::string & path) { const auto & p = entry.path(); if (std::filesystem::is_regular_file(p)) { common_file_info info; - info.path = p.string(); - info.name = p.filename().string(); + info.path = p.string(); + info.name = p.filename().string(); + info.is_dir = false; try { info.size = static_cast(std::filesystem::file_size(p)); } catch (const std::filesystem::filesystem_error &) { info.size = 0; } files.push_back(std::move(info)); + } else if (include_directories && std::filesystem::is_directory(p)) { + common_file_info info; + info.path = p.string(); + info.name = p.filename().string(); + info.size = 0; // Directories have no size + info.is_dir = true; + files.push_back(std::move(info)); } } catch (const std::filesystem::filesystem_error &) { // skip entries we cannot inspect diff --git a/common/common.h b/common/common.h index de5b404dd8..20ba209ce4 100644 --- a/common/common.h +++ b/common/common.h @@ -26,8 +26,6 @@ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) -#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" - struct common_time_meas { common_time_meas(int64_t & t_acc, bool disable = false); ~common_time_meas(); @@ -460,6 +458,10 @@ struct common_params { bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; + // router server configs + std::string models_dir = ""; // directory containing models for the router server + int max_models = 4; // maximum number of models to load simultaneously + bool log_json = false; std::string slot_save_path; @@ -623,8 +625,9 @@ struct common_file_info { std::string path; std::string name; size_t size = 0; // in bytes + bool is_dir = false; }; -std::vector fs_list_files(const std::string & path); +std::vector fs_list(const std::string & path, bool include_directories); // // Model utils diff --git a/common/download.cpp b/common/download.cpp index eeb32b6a86..1a3bc9216f 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -1047,7 +1047,7 @@ std::string 
common_docker_resolve_model(const std::string &) { std::vector common_list_cached_models() { std::vector models; const std::string cache_dir = fs_get_cache_directory(); - const std::vector files = fs_list_files(cache_dir); + const std::vector files = fs_list(cache_dir, false); for (const auto & file : files) { if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) { common_cached_model_info model_info; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 9576dcb6e8..5cbf5683e1 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2246,8 +2246,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx bool & use_cann_graph, bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH - ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); - if (use_cann_graph && cann_graph_update_required) { + if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH @@ -2271,12 +2270,14 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx } #ifdef USE_ACL_GRAPH - if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture - ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); - } - if (use_cann_graph) { - // Execute graph + ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); + + if (cann_graph_update_required) { // End CANN graph capture + ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); + } + + // Execute CANN graph ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } #endif // USE_ACL_GRAPH diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 25e9308d75..99ec96869a 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -224,6 +224,10 @@ static const char * cu_get_error_str(CUresult err) { #define AMD_MFMA_AVAILABLE #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA) +#if defined(GGML_USE_HIP) && defined(RDNA4) +#define AMD_WMMA_AVAILABLE +#endif // defined(GGML_USE_HIP) && defined(RDNA4) + // The Volta instructions are in principle available on Turing or newer but they are effectively unusable: #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA #define VOLTA_MMA_AVAILABLE @@ -283,6 +287,10 @@ static bool amd_mfma_available(const int cc) { #endif //!defined(GGML_HIP_NO_MMQ_MFMA) } +static bool amd_wmma_available(const int cc) { + return GGML_CUDA_CC_IS_RDNA4(cc); +} + static bool volta_mma_available(const int cc) { return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA; } diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index 8a5e08ef66..09f9a33f90 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -39,6 +39,15 @@ template return __float2bfloat16(float(x)); } else if constexpr(std::is_same_v) { return __bfloat162float(x); + } else if constexpr(std::is_same_v && std::is_same_v) { + return __float22half2_rn(x); + } else if constexpr(std::is_same_v && std::is_same_v) { + // bypass compile error on cuda 12.0.1 +#ifdef GGML_USE_HIP + return __float22bfloat162_rn(x); +#else + return {x.x, x.y}; +#endif // GGML_USE_HIP } else if constexpr(std::is_same_v) { return int32_t(x); } else { diff 
--git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index a7a28fd1ae..c3c4b77996 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -74,6 +74,33 @@ namespace ggml_cuda_mma { static constexpr int J = J_; #if defined(GGML_USE_HIP) +#if defined(RDNA4) + static constexpr int ne = I * J / 32; + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 16) { + return 8 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } +#else static constexpr int ne = I * J / 64; T x[ne] = {0}; @@ -119,6 +146,7 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(RDNA4) #elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA static constexpr int ne = I * J / 32; T x[ne] = {0}; @@ -236,6 +264,32 @@ namespace ggml_cuda_mma { return -1; } } +#elif defined(AMD_WMMA_AVAILABLE) + static constexpr int ne = I * J / 32; + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return 4 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } #else static constexpr int ne = I * J / WARP_SIZE; half2 x[ne] = {{0.0f, 0.0f}}; @@ -285,6 +339,34 @@ namespace ggml_cuda_mma { struct tile { static constexpr int I = I_; static constexpr int J = J_; + +#if defined(AMD_WMMA_AVAILABLE) + static constexpr int ne = I * J / 32; + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return 4 * (threadIdx.x / 16) + l; + } else { + NO_DEVICE_CODE; + return -1; + } + } +#else static constexpr int ne = I * J / WARP_SIZE; nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; @@ -320,6 +402,7 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(AMD_WMMA_AVAILABLE) }; template @@ -353,6 +436,8 @@ namespace ggml_cuda_mma { const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); xi[0] = xs[0]; } +#elif defined(AMD_WMMA_AVAILABLE) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); #else #pragma unroll for (int l = 0; l < t.ne; ++l) { @@ -639,12 +724,34 @@ namespace ggml_cuda_mma { : "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7]) : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE +#elif defined(AMD_WMMA_AVAILABLE) + using halfx8_t = __attribute__((ext_vector_type(8))) _Float16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const halfx8_t& a_frag = reinterpret_cast(A.x[0]); + const halfx8_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = 
__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag); #else GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } + static __device__ __forceinline__ void mma( + tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) { +#if defined(AMD_WMMA_AVAILABLE) + using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const bf16x8_t& a_frag = reinterpret_cast(A.x[0]); + const bf16x8_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; +#endif // AMPERE_MMA_AVAILABLE + } + static __device__ __forceinline__ void mma( tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) { #if defined(AMD_MFMA_AVAILABLE) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 153dd5a97d..5c51a22256 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16) { + if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) { return false; } } @@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const case GGML_TYPE_F32: return ampere_mma_available(cc); case GGML_TYPE_F16: - return volta_mma_available(cc) || turing_mma_available(cc); + return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc); case GGML_TYPE_BF16: - return ampere_mma_available(cc); + return ampere_mma_available(cc) || amd_wmma_available(cc); default: return false; } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index 45724e0911..c2a0a2e42f 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -2,6 +2,7 @@ #include "mma.cuh" #include "common.cuh" +#include "convert.cuh" using namespace ggml_cuda_mma; @@ -27,20 +28,35 @@ static __global__ void mul_mat_f( const int stride_col_id, const int stride_row_id, const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added +#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_WMMA_AVAILABLE) + // Special case for tf32, just dummy mma layout as wmma doesn't support it. + constexpr int tile_B_I = std::is_same_v ? 8 : 16; + constexpr int tile_C_J = std::is_same_v ? 
8 : 16; + typedef tile<16, 8, T> tile_A; + typedef tile tile_B; + typedef tile<16, tile_C_J, float> tile_C; + + constexpr bool a_supported = tile_A::supported(); + constexpr bool b_supported = tile_B::supported(); + constexpr bool c_supported = tile_C::supported(); + constexpr bool supported = a_supported && b_supported && c_supported; +#else constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported(); constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported(); - - if (!I_16_supported && !I_32_supported) { - NO_DEVICE_CODE; - return; - } + constexpr bool supported = I_16_supported || I_32_supported; constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster. typedef tile tile_A; typedef tile<8, 8, T> tile_B; typedef tile tile_C; +#endif // defined(AMD_WMMA_AVAILABLE) + if constexpr (!supported) { + NO_DEVICE_CODE; + return; + } constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int tile_k_padded = warp_size + 4; @@ -161,11 +177,11 @@ static __global__ void mul_mat_f( if constexpr (!has_ids) { const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } else { const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0; float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } } } else { @@ -239,7 +255,7 @@ static __global__ void mul_mat_f( channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) } //This kernel is for larger batch sizes of mul_mat_id @@ -253,20 +269,35 @@ static __global__ void mul_mat_f_ids( const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const uint3 sis1_fd, const uint3 nch_fd) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added +#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_WMMA_AVAILABLE) + // Special case for tf32, just dummy mma layout as wmma doesn't support it. + constexpr int tile_B_I = std::is_same_v ? 8 : 16; + constexpr int tile_C_J = std::is_same_v ? 
8 : 16; + typedef tile<16, 8, T> tile_A; + typedef tile tile_B; + typedef tile<16, tile_C_J, float> tile_C; + + constexpr bool a_supported = tile_A::supported(); + constexpr bool b_supported = tile_B::supported(); + constexpr bool c_supported = tile_C::supported(); + constexpr bool supported = a_supported && b_supported && c_supported; +#else constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported(); constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported(); + constexpr bool supported = I_16_supported || I_32_supported; - if (!I_16_supported && !I_32_supported) { - NO_DEVICE_CODE; - return; - } - - constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster. + constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster. typedef tile tile_A; typedef tile<8, 8, T> tile_B; typedef tile tile_C; +#endif // defined(AMD_WMMA_AVAILABLE) + if constexpr (!supported) { + NO_DEVICE_CODE; + return; + } constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int tile_k_padded = warp_size + 4; @@ -408,7 +439,7 @@ static __global__ void mul_mat_f_ids( #pragma unroll for (int j0 = 0; j0 < tile_B::I; ++j0) { const float2 tmp = vals_buf[curr_buf][j0]; - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast(tmp); } if (itB + 1 < ntB) { @@ -492,7 +523,7 @@ static __global__ void mul_mat_f_ids( channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd); NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) } template @@ -554,7 +585,8 @@ void mul_mat_f_cuda( cudaStream_t stream, const mmf_ids_data * ids_data) { typedef tile<16, 8, T> tile_A_16; typedef tile<32, 8, T> tile_A_32; - typedef tile< 8, 8, T> tile_B; + typedef tile<16, 8, T> tile_B_16; + typedef tile< 8, 8, T> tile_B_8; GGML_ASSERT(ncols_x % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); @@ -581,7 +613,8 @@ void mul_mat_f_cuda( constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4; - const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4; + const int nbytes_cols_per_block_pad = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I; + const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4; const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); const int nbytes_slotmap = ids ? 
GGML_PAD(cols_per_block, 16) * sizeof(int) : 0; const int nbytes_shared_total = nbytes_shared + nbytes_slotmap; diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 16044975d9..87b09cca3a 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -106,33 +106,32 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; - int opt_path = 0; if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; - bool src1_valid = src1->ne[0]; + const bool src1_valid = src1->ne[0]; + const int nc = (src1_valid) ? ne00 : ne00 / 2; if (!src1_valid) { - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int32_t swapped = op_params[1]; + data_src1 = data_src0; + src1_row_size = src0_row_size; + + const size_t nc_in_bytes = nc * SIZEOF_FP32; + data_src0 += swapped ? nc_in_bytes : 0; + data_src1 += swapped ? 0 : nc_in_bytes; } uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int32_t swapped = op_params[1]; - - const int nc = (src1_valid) ? ne0 : ne0 / 2; - + const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); @@ -142,12 +141,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - if (!src1_valid) { - src0 += swapped ? nc : 0; - src1 += swapped ? 0 : nc; - } - - if (1 == opt_path) { + if (opt_path) { hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc); @@ -218,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, const float alpha = ((const float *) (op_params))[2]; const float limit = ((const float *) (op_params))[3]; - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const int nc = (src1_valid) ? 
ne00 : ne00 / 2; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 19f6795083..d0735e9325 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,6 +16,19 @@ #include "hvx-utils.h" #include "ops-utils.h" +static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_fp32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { int left_over = num_elems & (VLEN_FP32 - 1); int num_elems_whole = num_elems - left_over; @@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); } } } else { @@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); } } } @@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - vec_out = hvx_vec_exp_fp32(in); + vec_out = hvx_vec_exp_fp32_guard(in); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 4cf588a878..953d3e6c16 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); } } @@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard(in); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff 
--git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d3599bc9c1..e02b1d9099 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -401,7 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); + static const float kInf = INFINITY; + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; @@ -409,17 +411,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); + HVX_Vector in = *vec_in1++; + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + v = Q6_Vsf_equals_Vqf32(v); + v = Q6_V_vmux_QVV(pred_inf, inf, v); + *vec_out++ = v; } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; } } @@ -429,8 +438,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index b2ca8e88f4..5f94645cde 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -12,6 +12,15 @@ #define VLEN_FP32 (VLEN / SIZEOF_FP32) #define VLEN_FP16 (VLEN / SIZEOF_FP16) +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; + static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { float f; @@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 } static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - __fp16 d[64]; - } u = { .v = v }; + HVX_VectorAlias u = { .v = v }; const uint32_t n0 = n / 16; const uint32_t n1 = n % 16; int i = 0; for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.d + (16 * i), 16); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); } if (n1) { - htp_dump_fp16_line(pref, u.d + (16 * i), n1); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); } } @@ -411,8 +417,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n HVX_Vector sum = in, sum_t; while (width < total) 
{ - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum width = width << 1; } return sum; @@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { // neg by setting the fp16 sign bit HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); } static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { @@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { #else // neg by setting the fp32 sign bit HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); #endif // __HTP_ARCH__ > 75 } @@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } +static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { + static const float kInf = INFINITY; + static const uint32_t kNanMask = 0x7fffffff; + static const uint32_t kNanMin = 0x7f800000; + + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + + HVX_Vector out = hvx_vec_inverse_fp32(v_sf); + + const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); + const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); + + return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); +} + #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 @@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { + static const float kMaxExp = -88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); + return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); +} + static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; @@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); } } diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 4cb6afe927..2319f7a9e2 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -6895,9 +6895,23 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co cl_context context = backend_ctx->context; if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){ - if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0){ - ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); - return; + if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) { + // For KQ + if (ggml_is_permuted(src0) && ggml_is_permuted(src1) && + nb00 <= nb02 && + nb02 <= nb01 && + nb01 <= 
nb03 && + nb10 <= nb12 && + nb12 <= nb11 && + nb11 <= nb13) { + ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); + return; + } + // For KQV + if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); + return; + } } } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f83dfdaef6..d4f27af8fc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -513,6 +513,7 @@ struct vk_device_struct { vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; + bool support_async; uint32_t subgroup_size; uint32_t shader_core_count; bool uma; @@ -4273,6 +4274,16 @@ static vk_device ggml_vk_get_device(size_t idx) { device->vendor_id = device->properties.vendorID; device->driver_id = driver_props.driverID; + // Implementing the async backend interfaces seems broken on older Intel HW, + // see https://github.com/ggml-org/llama.cpp/issues/17302. + device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || + std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && + getenv("GGML_VK_DISABLE_ASYNC") == nullptr; + + if (!device->support_async) { + GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); + } + const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { @@ -13187,6 +13198,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->perf_logger->print_timings(); } + if (!ctx->device->support_async) { + ggml_vk_synchronize(ctx); + } + return GGML_STATUS_SUCCESS; UNUSED(backend); @@ -13480,6 +13495,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .context = */ ctx, }; + if (!ctx->device->support_async) { + vk_backend->iface.get_tensor_async = nullptr; + } + return vk_backend; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e703181a19..175549a9e3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1593,7 +1593,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); if (!is_lite) { @@ -4581,7 +4582,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - const bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 68f72f72bb..0b41f7ba8e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -4,7 +4,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git 
a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2bb4b12224..90bdb80e5e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7819,6 +7819,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1)); } } @@ -7827,6 +7828,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1)); } } @@ -7837,6 +7839,7 @@ static std::vector> make_test_cases_perf() { for (int bs : {1, 4, 8, 512}) { for (ggml_type type_a : {GGML_TYPE_MXFP4}) { for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880)); test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1)); } } diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index a284a1f0c5..de587d456d 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -23,7 +23,7 @@ #endif struct quantize_stats_params { - std::string model = DEFAULT_MODEL_PATH; + std::string model = "models/7B/ggml-model-f16.gguf"; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; diff --git a/tools/server/README.md b/tools/server/README.md index 54c1062c9b..3e311a657c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1364,18 +1364,32 @@ llama-server -hf /: *The server must be restarted after adding a new model.* -Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**: +Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Example command: ```sh -llama-3.2-1b-Q4_K_M.gguf -gemma-3-4b-it-Q8_0.gguf -mmproj-gemma-3-4b-it-Q8_0.gguf # must be "mmproj-" + text model filename +llama-server --models-dir ./models_directory ``` -Example: +If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. The directory structure should look like this: ```sh -llama-server --models-dir ./path/to/models +models_directory + │ + │ # single file + ├─ llama-3.2-1b-Q4_K_M.gguf + ├─ Qwen3-8B-Q4_K_M.gguf + │ + │ # multimodal + ├─ gemma-3-4b-it-Q8_0 + │ ├─ gemma-3-4b-it-Q8_0.gguf + │ └─ mmproj-F16.gguf # file name must start with "mmproj" + │ + │ # multi-shard + ├─ gemma-3-4b-it-Q8_0 + │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf + │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf + │ ├─ ... 
+ │ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf ``` You may also specify default arguments that will be passed to every loaded model instance: @@ -1384,6 +1398,8 @@ You may also specify default arguments that will be passed to every loaded model llama-server -ctx 8192 -n 1024 -np 2 ``` +Note: model instances inherit both command line arguments and environment variables from the router server. + ### Routing requests Requests are routed according to the requested model name. @@ -1393,7 +1409,12 @@ For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, et ```json { "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", - ... + "messages": [ + { + "role": "user", + "content": "hello" + } + ] } ``` @@ -1405,15 +1426,92 @@ GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M ### GET `/models`: List available models -TODO +Listing all models in cache. The model metadata will also include a field to indicate the status of the model: + +```json +{ + "data": [{ + "name": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "in_cache": true, + "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf", + "status": { + "value": "loaded" + }, + ... + }] +} +``` + +Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`. + +The `status` object can be: + +```json +"status": { + "value": "unloaded" +} +``` + +```json +"status": { + "value": "loading" +} +``` + +```json +"status": { + "value": "failed" +} +``` + +```json +"status": { + "value": "loaded" +} +``` ### POST `/models/load`: Load a model -TODO + +Load a model + +Payload: + +```json +{ + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" +} +``` + +Response: + +```json +{ + "success": true +} +``` ### POST `/models/unload`: Unload a model -TODO + +Unload a model + +Payload: + +```json +{ + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", +} +``` + +Response: + +```json +{ + "success": true +} +``` ## More examples diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 92b02fbf49..071b5522ea 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() { return std::filesystem::path(buf.data()); } } - return std::filesystem::path(std::string(buf.data(), (size > 0) ? 
## More examples diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 92b02fbf49..071b5522ea 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() { return std::filesystem::path(buf.data()); } } - return std::filesystem::path(std::string(buf.data(), (size > 0) ? size : 0)); + throw std::runtime_error("_NSGetExecutablePath failed after buffer resize"); } #else char path[FILENAME_MAX]; @@ -69,6 +70,64 @@ static std::filesystem::path get_server_exec_path() { #endif } +struct local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +static std::vector<local_model> list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); + } + + std::vector<local_model> models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + // prefer the first shard if present, otherwise use the single model file + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); + } + } + return models; +} + // // server_models // @@ -85,15 +144,17 @@ server_models::server_models( base_env.push_back(std::string(*env)); } // TODO: allow refreshing cached model list + // add cached models auto cached_models = common_list_cached_models(); for (const auto & model : cached_models) { server_model_meta meta{ /* name */ model.to_string(), /* path */ model.manifest_path, - /* path_mmproj */ "", + /* path_mmproj */ "", // auto-detected when loading /* in_cache */ true, /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), /* th */ std::thread(), /* meta */ meta }; } + // add local models specified via --models-dir + if (!params.models_dir.empty()) { + auto local_models = list_local_models(params.models_dir); + for (const auto & model : local_models) { + if (mapping.find(model.name) != mapping.end()) { + // already exists in cached models, skip + continue; + } + server_model_meta meta{ + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + } + } void server_models::update_meta(const std::string & name, const server_model_meta & meta) { @@ -207,11 +292,39 @@ std::vector<server_model_meta> server_models::get_all_meta() { return result; } +void server_models::unload_lru() { + if (base_params.max_models <= 0) { + return; // no limit + } + // unload the least recently used (LRU) server instance once the max_models limit is reached
+ std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + { + std::lock_guard lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_active()) { + count_active++; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } + } + } + } + if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) { + SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + unload(lru_model_name); + } +} + void server_models::load(const std::string & name) { - std::lock_guard lk(mutex); - if (mapping.find(name) == mapping.end()) { + if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } + unload_lru(); + + std::lock_guard lk(mutex); auto meta = mapping[name].meta; if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) { @@ -221,9 +334,10 @@ void server_models::load(const std::string & name) { // prepare new instance info instance_t inst; - inst.meta = meta; - inst.meta.port = get_free_port(); - inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta = meta; + inst.meta.port = get_free_port(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.last_used = ggml_time_ms(); if (inst.meta.port <= 0) { throw std::runtime_error("failed to get a port number"); @@ -385,7 +499,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { return true; } -server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) { +server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) { auto meta = get_meta(name); if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); @@ -393,6 +507,10 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co if (ensure_model_loaded(name)) { meta = get_meta(name); // refresh meta } + if (update_last_used) { + std::unique_lock lk(mutex); + mapping[name].meta.last_used = ggml_time_ms(); + } SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); auto proxy = std::make_unique( method, @@ -572,8 +690,11 @@ server_http_proxy::server_http_proxy( // wait for the first chunk (headers) msg_t header; - pipe->read(header, should_stop); - SRV_DBG("%s", "received response headers\n"); - this->status = header.status; - this->headers = header.headers; + if (pipe->read(header, should_stop)) { + SRV_DBG("%s", "received response headers\n"); + this->status = header.status; + this->headers = header.headers; + } else { + SRV_DBG("%s", "no response headers received (request cancelled?)\n"); + } } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f8ae757fa4..3cb3b39fe7 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -58,6 +58,8 @@ struct server_model_meta { bool in_cache = false; // if true, use -hf; use -m otherwise int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; + int64_t last_used = 0; + bool is_active() const { return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; } @@ -81,6 +83,9 @@ private: void update_meta(const std::string & name, const server_model_meta & meta); + // unload least recently used models if the limit is reached + void unload_lru(); + public: 
server_models(const common_params & params, int argc, char ** argv, char ** envp); @@ -109,7 +114,7 @@ public: bool ensure_model_loaded(const std::string & name); // proxy an HTTP request to the model instance - server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name); + server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); // notify the router server that a model instance is ready static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 37ef9d96ea..43d145fb67 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5143,7 +5143,7 @@ public: std::string method = "GET"; std::string name = req.get_param("model"); models->ensure_model_loaded(name); - return models->proxy_request(req, method, name); + return models->proxy_request(req, method, name, false); }; server_http_context::handler_t proxy_post = [this](const server_http_req & req) { @@ -5151,7 +5151,7 @@ public: json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); models->ensure_model_loaded(name); - return models->proxy_request(req, method, name); + return models->proxy_request(req, method, name, true); // update last usage for POST request only }; server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) { @@ -5189,9 +5189,10 @@ public: auto all_models = models->get_all_meta(); for (const auto & model : all_models) { models_json.push_back(json { - {"model", model.name}, - {"name", model.name}, - {"id", model.name}, + {"name", model.name}, + {"id", model.name}, + {"in_cache", model.in_cache}, + {"path", model.path}, // TODO: other fields... 
{"status", { {"value", server_model_status_to_string(model.status)} @@ -5662,8 +5663,7 @@ int main(int argc, char ** argv, char ** envp) { // register API routes server_routes routes(params, ctx_server, ctx_http); - // TODO: improve this by changing arg.cpp - bool is_router_server = params.model.path == DEFAULT_MODEL_PATH; + bool is_router_server = params.model.path.empty(); if (is_router_server) { // setup server instances manager routes.models.reset(new server_models(params, argc, argv, envp)); diff --git a/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte b/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte index 2aebb40d9c..bd40aba568 100644 --- a/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte +++ b/tools/server/webui/src/lib/components/app/misc/SelectorModel.svelte @@ -267,7 +267,6 @@ if (activeId) { return options.find((option) => option.id === activeId); } - return options[0]; } diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 3b42fc8c1d..8e0f8064f7 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -22,7 +22,38 @@ target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_TCP_NODELAY=1 ) -if (LLAMA_OPENSSL) +if (LLAMA_BUILD_BORINGSSL) + set(OPENSSL_NO_ASM ON CACHE BOOL "Disable OpenSSL ASM code (BoringSSL)") + set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)") + + set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository") + set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version") + + message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}") + + include(FetchContent) + FetchContent_Declare( + boringssl + GIT_REPOSITORY ${BORINGSSL_GIT} + GIT_TAG ${BORINGSSL_VERSION} + PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake" + ) + + set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(SAVED_BUILD_TESTING ${BUILD_TESTING}) + + set(BUILD_SHARED_LIBS OFF) + set(BUILD_TESTING OFF) + + FetchContent_MakeAvailable(boringssl) + + set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS}) + set(BUILD_TESTING ${SAVED_BUILD_TESTING}) + + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) + target_link_libraries(${TARGET} PUBLIC ssl crypto) + +elseif (LLAMA_OPENSSL) find_package(OpenSSL) if (OpenSSL_FOUND) include(CheckCSourceCompiles) @@ -44,17 +75,20 @@ if (LLAMA_OPENSSL) set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES}) if (OPENSSL_VERSION_SUPPORTED) message(STATUS "OpenSSL found: ${OPENSSL_VERSION}") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto) - if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) - find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) - find_library(SECURITY_FRAMEWORK Security REQUIRED) - target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) - endif() endif() else() message(STATUS "OpenSSL not found, SSL support disabled") endif() endif() +if (CPPHTTPLIB_OPENSSL_SUPPORT) + target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp + if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) + find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) + 
find_library(SECURITY_FRAMEWORK Security REQUIRED) + target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) + endif() +endif() diff --git a/vendor/cpp-httplib/patch-boringssl.cmake b/vendor/cpp-httplib/patch-boringssl.cmake new file mode 100644 index 0000000000..2914e1dddb --- /dev/null +++ b/vendor/cpp-httplib/patch-boringssl.cmake @@ -0,0 +1,6 @@ +# Remove bssl +file(READ "CMakeLists.txt" content) +string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}") +string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}") +string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}") +file(WRITE "CMakeLists.txt" "${content}")
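As a usage note (a sketch, not part of this diff): with the options introduced above, cpp-httplib's HTTPS support can be satisfied either by the vendored BoringSSL fetched at configure time or by a system OpenSSL. Assuming a standard CMake and Git toolchain, the two configurations might be exercised as follows; either path defines `CPPHTTPLIB_OPENSSL_SUPPORT` on the target, which is what `server.cpp` checks for:

```sh
# assumption: run from the repository root with network access for FetchContent
# vendored BoringSSL (static, fetched at configure time)
cmake -B build-boringssl -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build-boringssl --config Release

# system OpenSSL (requires the OpenSSL development headers, e.g. libssl-dev)
cmake -B build-openssl -DLLAMA_OPENSSL=ON
cmake --build build-openssl --config Release
```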