diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5215cc3572..eee42759fc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -69,13 +69,6 @@ jobs: key: macOS-latest-cmake-arm64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -83,6 +76,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=OFF \ -DGGML_METAL_SHADER_DEBUG=ON \ @@ -110,13 +105,6 @@ jobs: key: macOS-latest-cmake-x64 evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Build id: cmake_build run: | @@ -126,6 +114,8 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON \ -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 @@ -151,13 +141,6 @@ jobs: key: macOS-latest-cmake-arm64-webgpu evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - name: Dawn Dependency id: dawn-depends run: | @@ -217,7 +200,7 @@ jobs: sudo apt-get update sudo apt-get install -y --no-install-recommends \ python3 python3-pip python3-dev \ - libjpeg-dev build-essential libcurl4-openssl-dev \ + libjpeg-dev build-essential libssl-dev \ git-lfs - name: Python Dependencies @@ -238,6 +221,8 @@ jobs: id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -294,13 +279,15 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -311,6 +298,8 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ @@ -335,7 +324,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build @@ -343,6 +332,8 @@ jobs: mkdir build cd build cmake .. \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_LLGUIDANCE=ON cmake --build . --config Release -j $(nproc) @@ -373,12 +364,14 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev - name: Build id: cmake_build run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -405,12 +398,14 @@ jobs: - name: Dependencies id: depends run: | - sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev + sudo apt-get install -y glslc libvulkan-dev libssl-dev - name: Configure id: cmake_configure run: | cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DGGML_BACKEND_DL=ON \ -DGGML_CPU_ALL_VARIANTS=ON \ @@ -440,7 +435,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -466,6 +461,8 @@ jobs: run: | source ./vulkan_sdk/setup-env.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_VULKAN=ON cmake --build build --config Release -j $(nproc) @@ -497,7 +494,7 @@ jobs: run: | sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - name: Get latest Vulkan SDK version id: vulkan_sdk_version @@ -537,7 +534,10 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test @@ -560,7 +560,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -572,6 +572,8 @@ jobs: id: cmake_build run: | cmake -B build -S . \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON @@ -590,7 +592,7 @@ jobs: id: depends run: | apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev + apt-get install -y build-essential git cmake libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -602,6 +604,8 @@ jobs: id: cmake_build run: | cmake -B build -S . \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_MUSA=ON cmake --build build --config Release -j $(nproc) @@ -626,7 +630,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -648,6 +652,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx @@ -674,7 +680,7 @@ jobs: shell: bash run: | sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev + sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - name: install oneAPI MKL library shell: bash @@ -696,6 +702,8 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx \ @@ -722,12 +730,6 @@ jobs: key: macOS-latest-cmake-ios evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -759,12 +761,6 @@ jobs: key: macOS-latest-cmake-tvos evict-old-files: 1d - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -790,12 +786,6 @@ jobs: id: checkout uses: actions/checkout@v4 - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build id: cmake_build run: | @@ -838,12 +828,6 @@ jobs: name: llama-xcframework path: build-apple/llama.xcframework/ - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - name: Build llama.cpp with CMake id: cmake_build run: | @@ -995,21 +979,12 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - name: Add libopenblas.dll id: add_libopenblas_dll @@ -1053,7 +1028,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev + apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -1064,10 +1039,12 @@ jobs: - name: Build with CMake run: | cmake -S . -B build -G Ninja \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES=89-real \ -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_NATIVE=OFF \ -DGGML_CUDA=ON cmake --build build @@ -1101,25 +1078,20 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . -B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ + -DLLAMA_CURL=OFF ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ -DGGML_CPU_ALL_VARIANTS=ON ^ -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" + -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release @@ -1151,7 +1123,7 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - name: Build id: cmake_build @@ -1208,14 +1180,8 @@ jobs: key: ${{ github.job }} evict-old-files: 1d - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -1224,11 +1190,12 @@ jobs: -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" ` -DCMAKE_BUILD_TYPE=Release ` + -DLLAMA_CURL=OFF ` + -DLLAMA_BUILD_BORINGSSL=ON ` -DROCM_DIR="${env:HIP_PATH}" ` -DGGML_HIP=ON ` -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" + -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} ios-xcode-build: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index ebcd6424bc..be4e23bc81 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -56,7 +56,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -242,7 +242,7 @@ jobs: curl \ wget \ language-pack-en \ - libcurl4-openssl-dev + libssl-dev - name: Clone id: checkout @@ -283,6 +283,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ @@ -295,6 +297,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; @@ -306,6 +310,8 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server diff --git a/README.md b/README.md index 2962783585..cff3bd4370 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) +- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0) diff --git a/common/common.cpp b/common/common.cpp index 40dea7fd4b..6a6f5fec3d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -61,6 +60,14 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} + +common_time_meas::~common_time_meas() { + if (t_start_us >= 0) { + t_acc += ggml_time_us() - t_start_us; + } +} + // // CPU utils // diff --git a/common/common.h b/common/common.h index 5d289a116b..01e6dfe59b 100644 --- a/common/common.h +++ b/common/common.h @@ -2,17 +2,15 @@ #pragma once +#include "ggml-opt.h" +#include "llama-cpp.h" + #include #include #include #include #include #include -#include -#include - -#include "ggml-opt.h" -#include "llama-cpp.h" #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -30,6 +28,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct common_time_meas { + common_time_meas(int64_t & t_acc, bool disable = false); + ~common_time_meas(); + + const int64_t t_start_us; + + int64_t & t_acc; +}; + struct common_adapter_lora_info { std::string path; float scale; diff --git a/common/sampling.cpp b/common/sampling.cpp index ae2276c712..0a2be6bf7d 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -3,9 +3,10 @@ #include "common.h" #include "log.h" -#include -#include #include +#include +#include +#include // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h @@ -112,6 +113,13 @@ struct common_sampler { llama_token_data_array cur_p; + void reset() { + prev.clear(); + + llama_sampler_reset(grmr); + llama_sampler_reset(chain); + } + void set_logits(struct llama_context * ctx, int idx) { const float * sampled_probs = llama_get_backend_sampled_probs_ith (ctx, idx); const float * sampled_logits = llama_get_backend_sampled_logits_ith (ctx, idx); @@ -145,6 +153,12 @@ struct common_sampler { cur_p = { cur.data(), cur.size(), -1, false }; } + + common_time_meas tm() { + return common_time_meas(t_total_us, params.no_perf); + } + + mutable int64_t t_total_us = 0; }; static bool sampler_enabled(const struct common_params_sampling & params, enum common_sampler_type type) { @@ -356,6 +370,8 @@ void common_sampler_free(struct common_sampler * gsmpl) { } void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { + const auto tm = gsmpl->tm(); + if (accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); } @@ -366,9 +382,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo } void common_sampler_reset(struct common_sampler * gsmpl) { - llama_sampler_reset(gsmpl->grmr); - - llama_sampler_reset(gsmpl->chain); + gsmpl->reset(); } struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { @@ -385,11 +399,44 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance + const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0; + + llama_perf_sampler_data data_smpl; + llama_perf_context_data data_ctx; + + memset(&data_smpl, 0, sizeof(data_smpl)); + memset(&data_ctx, 0, sizeof(data_ctx)); + if (gsmpl) { - llama_perf_sampler_print(gsmpl->chain); + auto & data = data_smpl; + + data = llama_perf_sampler(gsmpl->chain); + + // note: the sampling time includes the samplers time + extra time spent in common/sampling + LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms); + LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample); } + if (ctx) { - llama_perf_context_print(ctx); + auto & data = data_ctx; + + data = llama_perf_context(ctx); + + const double t_end_ms = 1e-3 * ggml_time_us(); + + const double t_total_ms = t_end_ms - data.t_start_ms; + const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms); + const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms; + + LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc); + LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused); + llama_memory_breakdown_print(ctx); } } @@ -404,6 +451,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return id; } } + llama_synchronize(ctx); + + // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations + const auto tm = gsmpl->tm(); gsmpl->set_logits(ctx, idx); @@ -496,6 +547,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { // helpers llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) { + const auto tm = gsmpl->tm(); + auto * res = &gsmpl->cur_p; if (do_sort && !res->sorted) { diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index befe8ab9cc..57c6cd0df1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -277,10 +277,15 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: +def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]: + from huggingface_hub import try_to_load_from_cache + # normally, adapter does not come with base model config, we need to load it from AutoConfig config = AutoConfig.from_pretrained(hf_model_id) - return config.to_dict() + cache_dir = try_to_load_from_cache(hf_model_id, "config.json") + cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None + + return config.to_dict(), cache_dir if __name__ == '__main__': @@ -325,13 +330,13 @@ if __name__ == '__main__': # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") - hparams = load_hparams_from_hf(base_model_id) + hparams, dir_base_model = load_hparams_from_hf(base_model_id) elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") try: - hparams = load_hparams_from_hf(model_id) + hparams, dir_base_model = load_hparams_from_hf(model_id) except OSError as e: logger.error(f"Failed to load base model config: {e}") logger.error("Please try downloading the base model and add its path to --base") @@ -480,6 +485,7 @@ if __name__ == '__main__': dir_lora_model=dir_lora, lora_alpha=alpha, hparams=hparams, + remote_hf_model_id=base_model_id, ) logger.info("Exporting model...") diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index cefa39a57c..80c693ce61 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -4,10 +4,10 @@ #include "llama.h" #include "ggml.h" +#include #include #include #include -#include /** * This the arbitrary data which will be passed to each callback. @@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { return u.f; } -static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { +static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; float v; if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); } else if (type == GGML_TYPE_F32) { - v = *(float *) &data[i]; + v = *(const float *) &data[i]; } else if (type == GGML_TYPE_I64) { - v = (float) *(int64_t *) &data[i]; + v = (float) *(const int64_t *) &data[i]; } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) &data[i]; + v = (float) *(const int32_t *) &data[i]; } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) &data[i]; + v = (float) *(const int16_t *) &data[i]; } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) &data[i]; + v = (float) *(const int8_t *) &data[i]; } else if (type == GGML_TYPE_BF16) { - v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]); + v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); } else { GGML_ABORT("fatal error"); } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 9576dcb6e8..5cbf5683e1 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2246,8 +2246,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx bool & use_cann_graph, bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH - ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); - if (use_cann_graph && cann_graph_update_required) { + if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH @@ -2271,12 +2270,14 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx } #ifdef USE_ACL_GRAPH - if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture - ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); - } - if (use_cann_graph) { - // Execute graph + ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); + + if (cann_graph_update_required) { // End CANN graph capture + ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); + } + + // Execute CANN graph ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } #endif // USE_ACL_GRAPH diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index 1d5b44f9fe..55a00f008a 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -39,7 +39,7 @@ #include "kernels.h" -#define NELEMS(x) sizeof(x) / sizeof(*x) +#define NELEMS(x) (sizeof(x) / sizeof(*x)) template static inline size_t kernel_offs_fn3(size_t a, size_t b, size_t c) { @@ -635,6 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { }, #endif #endif + { /* Sentinel */ } }; static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { @@ -803,6 +804,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .op_type = */ GGML_TYPE_F32, }, #endif + { /* Sentinel */ } }; ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) { @@ -810,7 +812,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) { #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) { + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) { if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu && gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type && gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type && @@ -820,7 +822,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c } } if (!kernel) { - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) { + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) { if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu && gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type && gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type && @@ -830,6 +832,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c } } } +#else + GGML_UNUSED(gemm_gemv_kernels); + GGML_UNUSED(gemm_gemv_kernels_q8); + GGML_UNUSED(cpu_features); #endif } @@ -840,12 +846,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) ggml_kleidiai_kernels * kernels = nullptr; #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) { + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) { if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) { kernels = &gemm_gemv_kernels[i]; break; } } +#else + GGML_UNUSED(features); #endif return kernels; @@ -855,12 +863,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features) ggml_kleidiai_kernels * kernels = nullptr; #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) { + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) { if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) { kernels = &gemm_gemv_kernels_q8[i]; break; } } +#else + GGML_UNUSED(features); #endif return kernels; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b6209588db..41e89d83c2 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -9696,13 +9696,12 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params for (int64_t i00 = 0; i00 < n; ++i00) { float sum = 0.0f; for (int64_t t = 0; t < i00; ++t) { - sum += A_batch[i00 * n + t] * X_batch[i01 * n + t]; + sum += A_batch[i00 * n + t] * X_batch[t * k + i01]; } const float diag = A_batch[i00 * n + i00]; GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix"); - - X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag; + X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag; } } } diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 74c74d1a28..101a9c086b 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -160,18 +160,18 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F32xt svfloat32_t #define GGML_F32xt_ZERO svdup_n_f32(0.0f) #define GGML_F32xt_SET1(x) svdup_n_f32(x) -#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a) -#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b) -#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a) +#define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a) +#define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b) +#define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b) #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a) -#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c) #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) -#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b) #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b) -#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b) #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a) -#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a) #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ { \ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \ @@ -183,7 +183,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \ (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \ } -#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ + GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) #define GGML_F32_VEC GGML_F32xt #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO @@ -206,11 +207,11 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a) -#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c) #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) -#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b) #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b) -#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b) #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED #define GGML_F16x_VEC GGML_F32Cxt @@ -224,7 +225,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a) -#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a) #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \ { \ @@ -234,7 +235,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { __fp16 sum_f16 = svaddv_f16(pg16, sum1); \ (res) = (ggml_float) sum_f16; \ } -#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \ + GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4) // F16 NEON diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index ac59f1fe85..e0e9540433 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -698,60 +698,61 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { } inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { -#if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; - const int ggml_f16_epr = sve_register_length / 16; - const int ggml_f16_step = 2 * ggml_f16_epr; +#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE) + const int sve_register_length = svcntb() * 8; + const int ggml_f16_epr = sve_register_length / 16; + const int ggml_f16_step = 2 * ggml_f16_epr; - GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); - const int np = (n & ~(ggml_f16_step - 1)); - svfloat16_t ay1, ay2; + GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); + const int np = (n & ~(ggml_f16_step - 1)); + svfloat16_t ay1, ay2; - for (int i = 0; i < np; i += ggml_f16_step) { - ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_MUL(ay1, vx); - GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); + for (int i = 0; i < np; i += ggml_f16_step) { + ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); + ay1 = GGML_F16x_VEC_MUL(ay1, vx); + GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); - ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_MUL(ay2, vx); - GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); + ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); + ay2 = GGML_F16x_VEC_MUL(ay2, vx); + GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); + } + // leftovers + // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only + if (np < n) { + svbool_t pg = svwhilelt_b16(np, n); + svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); + svfloat16_t out = svmul_f16_m(pg, hy, vx); + svst1_f16(pg, (__fp16 *)(y + np), out); + } +#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh) + for (int i = 0, vl; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl); + vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl); + vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl); + vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl); + __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl); + } +#elif defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_MUL(ay[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); } - // leftovers - // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only - if (np < n) { - svbool_t pg = svwhilelt_b16(np, n); - svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); - svfloat16_t out = svmul_f16_m(pg, hy, vx); - svst1_f16(pg, (__fp16 *)(y + np), out); - } - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - // scalar - for (int i = 0; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); - } - #else - const int np = (n & ~(GGML_F16_STEP - 1)); + } - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_MUL(ay[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); - } - #endif + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); + } #else // scalar for (int i = 0; i < n; ++i) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 685dd37d13..a4ff461120 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3752,10 +3752,110 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } +#if defined(__linux__) +// Helper function to get available memory from /proc/meminfo for UMA systems +static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_kb, long * free_swap_kb) { + FILE * meminfo_file = nullptr; + // 2KB buffer for reading /proc/meminfo since it does not report size info, should be enough + const size_t BUFFER_SIZE = 2048; + auto file_buffer = std::make_unique(BUFFER_SIZE); + size_t bytes_read = 0; + long huge_tlb_total_pages = -1; + long huge_tlb_free_pages = -1; + long huge_tlb_page_size = -1; + + if (available_memory_kb == nullptr || free_swap_kb == nullptr) { + return false; + } + + meminfo_file = fopen("/proc/meminfo", "r"); + if (meminfo_file == nullptr) { + GGML_LOG_ERROR("%s: failed to open /proc/meminfo\n", __func__); + return false; + } + + // Read file into buffer + bytes_read = fread(file_buffer.get(), 1, BUFFER_SIZE - 1, meminfo_file); + fclose(meminfo_file); + + if (bytes_read == 0) { + GGML_LOG_ERROR("%s: failed to read from /proc/meminfo\n", __func__); + return false; + } + file_buffer[bytes_read] = '\0'; + + *available_memory_kb = -1; + *free_swap_kb = -1; + + // Parse the file buffer line by line + char * line = file_buffer.get(); + char * line_next; + while (line < file_buffer.get() + bytes_read) { + // Find the end of the current line + line_next = strchr(line, '\n'); + if (line_next != nullptr) { + *line_next = '\0'; + line_next++; + } else { + line_next = file_buffer.get() + bytes_read; + } + + long value; + if (sscanf(line, "MemAvailable: %ld kB", &value) == 1) { + *available_memory_kb = value; + } else if (sscanf(line, "SwapFree: %ld kB", &value) == 1) { + *free_swap_kb = value; + } else if (sscanf(line, "HugePages_Total: %ld", &value) == 1) { + huge_tlb_total_pages = value; + } else if (sscanf(line, "HugePages_Free: %ld", &value) == 1) { + huge_tlb_free_pages = value; + } else if (sscanf(line, "Hugepagesize: %ld kB", &value) == 1) { + huge_tlb_page_size = value; + } + + line = line_next; + } + + if (huge_tlb_total_pages != 0 && huge_tlb_total_pages != -1) { + *available_memory_kb = huge_tlb_free_pages * huge_tlb_page_size; + + // Hugetlbfs pages are not swappable. + *free_swap_kb = 0; + } + + GGML_LOG_DEBUG("%s: final available_memory_kb: %ld\n", __func__, *available_memory_kb); + return true; +} +#endif // defined(__linux__) + static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemGetInfo(free, total)); + +// ref: https://github.com/ggml-org/llama.cpp/pull/17368 +#if defined(__linux__) + // Check if this is a UMA (Unified Memory Architecture) system + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device)); + + // Check if UMA is explicitly enabled via environment variable + bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr; + bool is_uma = prop.unifiedAddressing > 0 || uma_env; + + if (is_uma) { + // For UMA systems (like DGX Spark), use system memory info + long available_memory_kb = 0; + long free_swap_kb = 0; + + if (ggml_backend_cuda_get_available_uma_memory(&available_memory_kb, &free_swap_kb) && available_memory_kb > 0) { + *free = (size_t)available_memory_kb * 1024; + } else { + GGML_LOG_ERROR("%s: /proc/meminfo reading failed, using cudaMemGetInfo\n", __func__); + } + } +#endif // defined(__linux__) + } static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 16044975d9..87b09cca3a 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -106,33 +106,32 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; - int opt_path = 0; if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; - bool src1_valid = src1->ne[0]; + const bool src1_valid = src1->ne[0]; + const int nc = (src1_valid) ? ne00 : ne00 / 2; if (!src1_valid) { - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int32_t swapped = op_params[1]; + data_src1 = data_src0; + src1_row_size = src0_row_size; + + const size_t nc_in_bytes = nc * SIZEOF_FP32; + data_src0 += swapped ? nc_in_bytes : 0; + data_src1 += swapped ? 0 : nc_in_bytes; } uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int32_t swapped = op_params[1]; - - const int nc = (src1_valid) ? ne0 : ne0 / 2; - + const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); @@ -142,12 +141,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - if (!src1_valid) { - src0 += swapped ? nc : 0; - src1 += swapped ? 0 : nc; - } - - if (1 == opt_path) { + if (opt_path) { hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc); @@ -218,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, const float alpha = ((const float *) (op_params))[2]; const float limit = ((const float *) (op_params))[3]; - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const int nc = (src1_valid) ? ne00 : ne00 / 2; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 19f6795083..d0735e9325 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,6 +16,19 @@ #include "hvx-utils.h" #include "ops-utils.h" +static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_fp32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { int left_over = num_elems & (VLEN_FP32 - 1); int num_elems_whole = num_elems - left_over; @@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); } } } else { @@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); } } } @@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - vec_out = hvx_vec_exp_fp32(in); + vec_out = hvx_vec_exp_fp32_guard(in); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 4cf588a878..953d3e6c16 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); } } @@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard(in); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d3599bc9c1..e02b1d9099 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -401,7 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); + static const float kInf = INFINITY; + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; @@ -409,17 +411,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); + HVX_Vector in = *vec_in1++; + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + v = Q6_Vsf_equals_Vqf32(v); + v = Q6_V_vmux_QVV(pred_inf, inf, v); + *vec_out++ = v; } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; } } @@ -429,8 +438,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index b2ca8e88f4..5f94645cde 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -12,6 +12,15 @@ #define VLEN_FP32 (VLEN / SIZEOF_FP32) #define VLEN_FP16 (VLEN / SIZEOF_FP16) +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; + static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { float f; @@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 } static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - __fp16 d[64]; - } u = { .v = v }; + HVX_VectorAlias u = { .v = v }; const uint32_t n0 = n / 16; const uint32_t n1 = n % 16; int i = 0; for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.d + (16 * i), 16); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); } if (n1) { - htp_dump_fp16_line(pref, u.d + (16 * i), n1); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); } } @@ -411,8 +417,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n HVX_Vector sum = in, sum_t; while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum width = width << 1; } return sum; @@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { // neg by setting the fp16 sign bit HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); } static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { @@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { #else // neg by setting the fp32 sign bit HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); #endif // __HTP_ARCH__ > 75 } @@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } +static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { + static const float kInf = INFINITY; + static const uint32_t kNanMask = 0x7fffffff; + static const uint32_t kNanMin = 0x7f800000; + + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + + HVX_Vector out = hvx_vec_inverse_fp32(v_sf); + + const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); + const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); + + return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); +} + #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 @@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { + static const float kMaxExp = -88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); + return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); +} + static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; @@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); } } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 366c54ebec..e46970dad4 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -11,6 +11,7 @@ #include #include #include +#include static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) { if (!t) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f83dfdaef6..d4f27af8fc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -513,6 +513,7 @@ struct vk_device_struct { vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; + bool support_async; uint32_t subgroup_size; uint32_t shader_core_count; bool uma; @@ -4273,6 +4274,16 @@ static vk_device ggml_vk_get_device(size_t idx) { device->vendor_id = device->properties.vendorID; device->driver_id = driver_props.driverID; + // Implementing the async backend interfaces seems broken on older Intel HW, + // see https://github.com/ggml-org/llama.cpp/issues/17302. + device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || + std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && + getenv("GGML_VK_DISABLE_ASYNC") == nullptr; + + if (!device->support_async) { + GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); + } + const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { @@ -13187,6 +13198,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->perf_logger->print_timings(); } + if (!ctx->device->support_async) { + ggml_vk_synchronize(ctx); + } + return GGML_STATUS_SUCCESS; UNUSED(backend); @@ -13480,6 +13495,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .context = */ ctx, }; + if (!ctx->device->support_async) { + vk_backend->iface.get_tensor_async = nullptr; + } + return vk_backend; } diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 46173585f2..c9056b59c7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -7b6abb2b92fcef35cb01c6ce6ada9bd85306522d +781baf2a14d9e0aaee542b2e1bb918bfc4132199 diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index bed706bb24..b3c5eb5717 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -6,8 +6,10 @@ #include #include +#include #include +#define MAX_REPETITION_THRESHOLD 2000 // // helpers // @@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence( size_t last_sym_start = rule.size(); const char * pos = src; - auto handle_repetitions = [&](int min_times, int max_times) { - + // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used + // (though it's technically the same as -1 now) + auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) { + bool no_max = max_times == UINT64_MAX; if (last_sym_start == rule.size()) { throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); } @@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence( rule.resize(last_sym_start); } else { // Repeat the previous elements (min_times - 1) times - for (int i = 1; i < min_times; i++) { + for (uint64_t i = 1; i < min_times; i++) { rule.insert(rule.end(), prev_rule.begin(), prev_rule.end()); } } uint32_t last_rec_rule_id = 0; - auto n_opt = max_times < 0 ? 1 : max_times - min_times; + auto n_opt = no_max ? 1 : max_times - min_times; llama_grammar_rule rec_rule(prev_rule); - for (int i = 0; i < n_opt; i++) { + for (uint64_t i = 0; i < n_opt; i++) { rec_rule.resize(prev_rule.size()); uint32_t rec_rule_id = generate_symbol_id( rule_name); - if (i > 0 || max_times < 0) { - rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); + if (i > 0 || no_max) { + rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id}); } rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); rec_rule.push_back({LLAMA_GRETYPE_END, 0}); @@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence( throw std::runtime_error(std::string("expecting an int at ") + pos); } const char * int_end = parse_int(pos); - int min_times = std::stoul(std::string(pos, int_end - pos)); + uint64_t min_times = std::stoul(std::string(pos, int_end - pos)); pos = parse_space(int_end, is_nested); - int max_times = -1; + uint64_t max_times = UINT64_MAX; // default: no max limit if (*pos == '}') { max_times = min_times; @@ -502,6 +506,10 @@ const char * llama_grammar_parser::parse_sequence( } else { throw std::runtime_error(std::string("expecting ',' at ") + pos); } + bool has_max = max_times != UINT64_MAX; + if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) { + throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions")); + } handle_repetitions(min_times, max_times); } else { break; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd32..c7a1880aad 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -20,10 +20,10 @@ static llama_logger_state g_logger_state; time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} time_meas::~time_meas() { - if (t_start_us >= 0) { - t_acc += ggml_time_us() - t_start_us; - } + if (t_start_us >= 0) { + t_acc += ggml_time_us() - t_start_us; } +} void llama_log_set(ggml_log_callback log_callback, void * user_data) { ggml_log_set(log_callback, user_data); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1647b85453..29fc6bbc63 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1593,7 +1593,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); if (!is_lite) { @@ -4581,7 +4582,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - const bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c126b70226..621438a9cf 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -534,9 +534,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) { for (auto * smpl : chain->samplers) { llama_sampler_reset(smpl); } - - chain->t_sample_us = 0; - chain->n_sample = 0; } static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { @@ -2854,8 +2851,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c void llama_perf_sampler_print(const struct llama_sampler * chain) { const auto data = llama_perf_sampler(chain); - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); + LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample); } void llama_perf_sampler_reset(struct llama_sampler * chain) { @@ -2865,5 +2861,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) { auto * ctx = (struct llama_sampler_chain *) chain->ctx; - ctx->t_sample_us = ctx->n_sample = 0; + ctx->t_sample_us = 0; + ctx->n_sample = 0; } diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 68f72f72bb..0b41f7ba8e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -4,7 +4,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - bool is_lite = (hparams.n_layer == 27); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 06185e47eb..64855b646f 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -161,11 +161,15 @@ int main(int argc, char ** argv) { return 1; } - auto * mem = llama_get_memory(ctx); - + llama_memory_t mem = llama_get_memory(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + + // note: the time for chat template initialization is not negligible: auto chat_templates = common_chat_templates_init(model, params.chat_template); + // start measuring performance timings from here + llama_perf_context_reset(ctx); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 8842620a7a..47bf04750a 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte new file mode 100644 index 0000000000..212b1fe890 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte @@ -0,0 +1,273 @@ + + +
+
+ {#if isPdf} +
+ + + +
+ {/if} +
+ +
+ {#if isImage && displayPreview} +
+ {displayName} +
+ {:else if isPdf && pdfViewMode === 'pages'} + {#if pdfImagesLoading} +
+
+
+ +

Converting PDF to images...

+
+
+ {:else if pdfImagesError} +
+
+ + +

Failed to load PDF images

+ +

{pdfImagesError}

+ + +
+
+ {:else if pdfImages.length > 0} +
+ {#each pdfImages as image, index (image)} +
+

Page {index + 1}

+ + PDF Page {index + 1} +
+ {/each} +
+ {:else} +
+
+ + +

No PDF pages available

+
+
+ {/if} + {:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent} +
+ {displayTextContent} +
+ {:else if isAudio} +
+
+ + + {#if attachment?.type === 'audioFile'} + + {:else if uploadedFile?.preview} + + {:else} +

Audio preview not available

+ {/if} + +

+ {displayName} +

+
+
+ {:else} +
+
+ {#if IconComponent} + + {/if} + +

Preview not available for this file type

+
+
+ {/if} +
+
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreviewDialog.svelte b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreviewDialog.svelte deleted file mode 100644 index 8a3389b657..0000000000 --- a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreviewDialog.svelte +++ /dev/null @@ -1,314 +0,0 @@ - - - - - -
-
- {#if IconComponent} - - {/if} - -
- {displayName} - -
- {displayType} - - {#if displaySize} - - - {formatFileSize(displaySize)} - {/if} -
-
-
- - {#if isPdf} -
- - - -
- {/if} -
-
- -
- {#if isImage && displayPreview} -
- {displayName} -
- {:else if isPdf && pdfViewMode === 'pages'} - {#if pdfImagesLoading} -
-
-
- -

Converting PDF to images...

-
-
- {:else if pdfImagesError} -
-
- - -

Failed to load PDF images

- -

{pdfImagesError}

- - -
-
- {:else if pdfImages.length > 0} -
- {#each pdfImages as image, index (image)} -
-

Page {index + 1}

- - PDF Page {index + 1} -
- {/each} -
- {:else} -
-
- - -

No PDF pages available

-
-
- {/if} - {:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent} -
- {displayTextContent} -
- {:else if isAudio} -
-
- - - {#if attachment?.type === 'audioFile'} - - {:else if uploadedFile?.preview} - - {:else} -

Audio preview not available

- {/if} - -

- {displayName} -

-
-
- {:else} -
-
- {#if IconComponent} - - {/if} - -

Preview not available for this file type

-
-
- {/if} -
-
-
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentFilePreview.svelte b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentFilePreview.svelte rename to tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentImagePreview.svelte b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentImagePreview.svelte rename to tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte index a2aea0232a..050c793316 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte @@ -1,11 +1,10 @@ - - - - - - - All Attachments ({displayItems.length}) - - View and manage all attached files - - - -
- {#if fileItems.length > 0} -
-

Files ({fileItems.length})

-
- {#each fileItems as item (item.id)} - openPreview(item, event)} - /> - {/each} -
-
- {/if} - - {#if imageItems.length > 0} -
-

Images ({imageItems.length})

-
- {#each imageItems as item (item.id)} - {#if item.preview} - openPreview(item, event)} - /> - {/if} - {/each} -
-
- {/if} +
+
+ {#if fileItems.length > 0} +
+

Files ({fileItems.length})

+
+ {#each fileItems as item (item.id)} + openPreview(item, event)} + /> + {/each} +
- - - + {/if} + + {#if imageItems.length > 0} +
+

Images ({imageItems.length})

+
+ {#each imageItems as item (item.id)} + {#if item.preview} + openPreview(item, event)} + /> + {/if} + {/each} +
+
+ {/if} +
+
{#if previewItem} - import { Square, ArrowUp } from '@lucide/svelte'; import { Button } from '$lib/components/ui/button'; - import ChatFormActionFileAttachments from './ChatFormActionFileAttachments.svelte'; - import ChatFormActionRecord from './ChatFormActionRecord.svelte'; - import ChatFormModelSelector from './ChatFormModelSelector.svelte'; + import { + ChatFormActionFileAttachments, + ChatFormActionRecord, + ChatFormModelSelector + } from '$lib/components/app'; import { config } from '$lib/stores/settings.svelte'; import type { FileTypeCategory } from '$lib/enums/files'; diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte index d37d806514..ff335c328c 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte @@ -1,7 +1,10 @@ - - -
- - + +
+
+ +
+ - -
-
- Settings - - -
- - -
-
- {#each settingSections as section (section.title)} - - {/each} -
-
- - +
+
+ {#each settingSections as section (section.title)} + + {/each}
+ +
- - -
-
- - - {#if currentSection.title === 'Import/Export'} - - {:else} -
- -
- {/if} -
- -
-

- Settings are saved in browser's localStorage -

-
-
-
+
- - - + +
+
+ + + {#if currentSection.title === 'Import/Export'} + + {:else} +
+ +
+ {/if} +
+ +
+

Settings are saved in browser's localStorage

+
+
+
+
+ + diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte index 7412753b3a..1bafaf137a 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte @@ -9,7 +9,7 @@ import { supportsVision } from '$lib/stores/server.svelte'; import { getParameterInfo, resetParameterToServerDefault } from '$lib/stores/settings.svelte'; import { ParameterSyncService } from '$lib/services/parameter-sync'; - import ParameterSourceIndicator from './ParameterSourceIndicator.svelte'; + import { ChatSettingsParameterSourceIndicator } from '$lib/components/app'; import type { Component } from 'svelte'; interface Props { @@ -63,7 +63,7 @@ {/if} {#if isCustomRealTime} - + {/if}
@@ -145,7 +145,7 @@ {/if} {#if isCustomRealTime} - + {/if}
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ImportExportTab.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte similarity index 98% rename from tools/server/webui/src/lib/components/app/chat/ChatSettings/ImportExportTab.svelte rename to tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte index 19c982c7b4..b2adf3942e 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ImportExportTab.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte @@ -1,7 +1,7 @@ - - - - - - - - - Select Conversations to {mode === 'export' ? 'Export' : 'Import'} - - - - {#if mode === 'export'} - Choose which conversations you want to export. Selected conversations will be downloaded - as a JSON file. - {:else} - Choose which conversations you want to import. Selected conversations will be merged - with your existing conversations. - {/if} - - - -
-
- - - - - {#if searchQuery} - - {/if} -
- -
- - {selectedIds.size} of {conversations.length} selected - {#if searchQuery} - ({filteredConversations.length} shown) - {/if} - -
- -
- - - - - - - - - - - - - {#if filteredConversations.length === 0} - - - - {:else} - {#each filteredConversations as conv (conv.id)} - toggleConversation(conv.id, e.shiftKey)} - > - - - - - - - {/each} - {/if} - -
- - Conversation NameMessages
- {#if searchQuery} - No conversations found matching "{searchQuery}" - {:else} - No conversations available - {/if} -
- { - e.preventDefault(); - e.stopPropagation(); - toggleConversation(conv.id, e.shiftKey); - }} - /> - -
- {conv.name || 'Untitled conversation'} -
-
- {messageCountMap.get(conv.id) ?? 0} -
-
-
-
- - - - - - -
-
-
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte index 5976e5dd03..34f3da53ea 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte @@ -2,7 +2,7 @@ import { goto } from '$app/navigation'; import { page } from '$app/state'; import { Trash2 } from '@lucide/svelte'; - import { ChatSidebarConversationItem, ConfirmationDialog } from '$lib/components/app'; + import { ChatSidebarConversationItem, DialogConfirmation } from '$lib/components/app'; import ScrollArea from '$lib/components/ui/scroll-area/scroll-area.svelte'; import * as Sidebar from '$lib/components/ui/sidebar'; import * as AlertDialog from '$lib/components/ui/alert-dialog'; @@ -158,7 +158,7 @@
- + import * as Dialog from '$lib/components/ui/dialog'; + import { ChatAttachmentPreview } from '$lib/components/app'; + import { formatFileSize } from '$lib/utils/file-preview'; + + interface Props { + open: boolean; + // Either an uploaded file or a stored attachment + uploadedFile?: ChatUploadedFile; + attachment?: DatabaseMessageExtra; + // For uploaded files + preview?: string; + name?: string; + type?: string; + size?: number; + textContent?: string; + } + + let { + open = $bindable(), + uploadedFile, + attachment, + preview, + name, + type, + size, + textContent + }: Props = $props(); + + let chatAttachmentPreviewRef: ChatAttachmentPreview | undefined = $state(); + + let displayName = $derived(uploadedFile?.name || attachment?.name || name || 'Unknown File'); + + let displayType = $derived( + uploadedFile?.type || + (attachment?.type === 'imageFile' + ? 'image' + : attachment?.type === 'textFile' + ? 'text' + : attachment?.type === 'audioFile' + ? attachment.mimeType || 'audio' + : attachment?.type === 'pdfFile' + ? 'application/pdf' + : type || 'unknown') + ); + + let displaySize = $derived(uploadedFile?.size || size); + + $effect(() => { + if (open && chatAttachmentPreviewRef) { + chatAttachmentPreviewRef.reset(); + } + }); + + + + + + {displayName} + + {displayType} + {#if displaySize} + • {formatFileSize(displaySize)} + {/if} + + + + + + diff --git a/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentsViewAll.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentsViewAll.svelte new file mode 100644 index 0000000000..8f6ca76d42 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentsViewAll.svelte @@ -0,0 +1,51 @@ + + + + + + + + + All Attachments ({totalCount}) + View and manage all attached files + + + + + + diff --git a/tools/server/webui/src/lib/components/app/dialogs/ChatErrorDialog.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogChatError.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/dialogs/ChatErrorDialog.svelte rename to tools/server/webui/src/lib/components/app/dialogs/DialogChatError.svelte diff --git a/tools/server/webui/src/lib/components/app/dialogs/DialogChatSettings.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogChatSettings.svelte new file mode 100644 index 0000000000..e9aaa1000b --- /dev/null +++ b/tools/server/webui/src/lib/components/app/dialogs/DialogChatSettings.svelte @@ -0,0 +1,37 @@ + + + + + + + diff --git a/tools/server/webui/src/lib/components/app/dialogs/ConfirmationDialog.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogConfirmation.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/dialogs/ConfirmationDialog.svelte rename to tools/server/webui/src/lib/components/app/dialogs/DialogConfirmation.svelte diff --git a/tools/server/webui/src/lib/components/app/dialogs/DialogConversationSelection.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogConversationSelection.svelte new file mode 100644 index 0000000000..1f8ea64bed --- /dev/null +++ b/tools/server/webui/src/lib/components/app/dialogs/DialogConversationSelection.svelte @@ -0,0 +1,68 @@ + + + + + + + + + + Select Conversations to {mode === 'export' ? 'Export' : 'Import'} + + + {#if mode === 'export'} + Choose which conversations you want to export. Selected conversations will be downloaded + as a JSON file. + {:else} + Choose which conversations you want to import. Selected conversations will be merged + with your existing conversations. + {/if} + + + + + + + diff --git a/tools/server/webui/src/lib/components/app/dialogs/ConversationTitleUpdateDialog.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogConversationTitleUpdate.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/dialogs/ConversationTitleUpdateDialog.svelte rename to tools/server/webui/src/lib/components/app/dialogs/DialogConversationTitleUpdate.svelte diff --git a/tools/server/webui/src/lib/components/app/dialogs/EmptyFileAlertDialog.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogEmptyFileAlert.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/dialogs/EmptyFileAlertDialog.svelte rename to tools/server/webui/src/lib/components/app/dialogs/DialogEmptyFileAlert.svelte diff --git a/tools/server/webui/src/lib/components/app/index.ts b/tools/server/webui/src/lib/components/app/index.ts index a695f99747..54bd8d5aa3 100644 --- a/tools/server/webui/src/lib/components/app/index.ts +++ b/tools/server/webui/src/lib/components/app/index.ts @@ -1,56 +1,63 @@ +// Chat + +export { default as ChatAttachmentPreview } from './chat/ChatAttachments/ChatAttachmentPreview.svelte'; +export { default as ChatAttachmentThumbnailFile } from './chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte'; +export { default as ChatAttachmentThumbnailImage } from './chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte'; export { default as ChatAttachmentsList } from './chat/ChatAttachments/ChatAttachmentsList.svelte'; -export { default as ChatAttachmentFilePreview } from './chat/ChatAttachments/ChatAttachmentFilePreview.svelte'; -export { default as ChatAttachmentImagePreview } from './chat/ChatAttachments/ChatAttachmentImagePreview.svelte'; -export { default as ChatAttachmentPreviewDialog } from './chat/ChatAttachments/ChatAttachmentPreviewDialog.svelte'; -export { default as ChatAttachmentsViewAllDialog } from './chat/ChatAttachments/ChatAttachmentsViewAllDialog.svelte'; +export { default as ChatAttachmentsViewAll } from './chat/ChatAttachments/ChatAttachmentsViewAll.svelte'; export { default as ChatForm } from './chat/ChatForm/ChatForm.svelte'; -export { default as ChatFormTextarea } from './chat/ChatForm/ChatFormTextarea.svelte'; -export { default as ChatFormActions } from './chat/ChatForm/ChatFormActions.svelte'; -export { default as ChatFormActionFileAttachments } from './chat/ChatForm/ChatFormActionFileAttachments.svelte'; -export { default as ChatFormActionRecord } from './chat/ChatForm/ChatFormActionRecord.svelte'; -export { default as ChatFormModelSelector } from './chat/ChatForm/ChatFormModelSelector.svelte'; -export { default as ChatFormHelperText } from './chat/ChatForm/ChatFormHelperText.svelte'; +export { default as ChatFormActionFileAttachments } from './chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte'; +export { default as ChatFormActionRecord } from './chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte'; +export { default as ChatFormActions } from './chat/ChatForm/ChatFormActions/ChatFormActions.svelte'; export { default as ChatFormFileInputInvisible } from './chat/ChatForm/ChatFormFileInputInvisible.svelte'; +export { default as ChatFormHelperText } from './chat/ChatForm/ChatFormHelperText.svelte'; +export { default as ChatFormModelSelector } from './chat/ChatForm/ChatFormModelSelector.svelte'; +export { default as ChatFormTextarea } from './chat/ChatForm/ChatFormTextarea.svelte'; export { default as ChatMessage } from './chat/ChatMessages/ChatMessage.svelte'; export { default as ChatMessages } from './chat/ChatMessages/ChatMessages.svelte'; +export { default as ChatMessageBranchingControls } from './chat/ChatMessages/ChatMessageBranchingControls.svelte'; export { default as ChatMessageThinkingBlock } from './chat/ChatMessages/ChatMessageThinkingBlock.svelte'; -export { default as MessageBranchingControls } from './chat/ChatMessages/ChatMessageBranchingControls.svelte'; -export { default as ChatProcessingInfo } from './chat/ChatProcessingInfo.svelte'; - -export { default as ChatScreenHeader } from './chat/ChatScreen/ChatScreenHeader.svelte'; -export { default as ChatScreenWarning } from './chat/ChatScreen/ChatScreenWarning.svelte'; export { default as ChatScreen } from './chat/ChatScreen/ChatScreen.svelte'; +export { default as ChatScreenHeader } from './chat/ChatScreen/ChatScreenHeader.svelte'; +export { default as ChatScreenProcessingInfo } from './chat/ChatScreen/ChatScreenProcessingInfo.svelte'; +export { default as ChatScreenWarning } from './chat/ChatScreen/ChatScreenWarning.svelte'; -export { default as ChatSettingsDialog } from './chat/ChatSettings/ChatSettingsDialog.svelte'; +export { default as ChatSettings } from './chat/ChatSettings/ChatSettings.svelte'; export { default as ChatSettingsFooter } from './chat/ChatSettings/ChatSettingsFooter.svelte'; export { default as ChatSettingsFields } from './chat/ChatSettings/ChatSettingsFields.svelte'; -export { default as ImportExportTab } from './chat/ChatSettings/ImportExportTab.svelte'; -export { default as ConversationSelectionDialog } from './chat/ChatSettings/ConversationSelectionDialog.svelte'; -export { default as ParameterSourceIndicator } from './chat/ChatSettings/ParameterSourceIndicator.svelte'; +export { default as ChatSettingsImportExportTab } from './chat/ChatSettings/ChatSettingsImportExportTab.svelte'; +export { default as ChatSettingsParameterSourceIndicator } from './chat/ChatSettings/ChatSettingsParameterSourceIndicator.svelte'; export { default as ChatSidebar } from './chat/ChatSidebar/ChatSidebar.svelte'; export { default as ChatSidebarConversationItem } from './chat/ChatSidebar/ChatSidebarConversationItem.svelte'; export { default as ChatSidebarSearch } from './chat/ChatSidebar/ChatSidebarSearch.svelte'; -export { default as ChatErrorDialog } from './dialogs/ChatErrorDialog.svelte'; -export { default as EmptyFileAlertDialog } from './dialogs/EmptyFileAlertDialog.svelte'; -export { default as ConversationTitleUpdateDialog } from './dialogs/ConversationTitleUpdateDialog.svelte'; +// Dialogs +export { default as DialogChatAttachmentPreview } from './dialogs/DialogChatAttachmentPreview.svelte'; +export { default as DialogChatAttachmentsViewAll } from './dialogs/DialogChatAttachmentsViewAll.svelte'; +export { default as DialogChatError } from './dialogs/DialogChatError.svelte'; +export { default as DialogChatSettings } from './dialogs/DialogChatSettings.svelte'; +export { default as DialogConfirmation } from './dialogs/DialogConfirmation.svelte'; +export { default as DialogConversationSelection } from './dialogs/DialogConversationSelection.svelte'; +export { default as DialogConversationTitleUpdate } from './dialogs/DialogConversationTitleUpdate.svelte'; +export { default as DialogEmptyFileAlert } from './dialogs/DialogEmptyFileAlert.svelte'; + +// Miscellanous + +export { default as ActionButton } from './misc/ActionButton.svelte'; +export { default as ActionDropdown } from './misc/ActionDropdown.svelte'; +export { default as ConversationSelection } from './misc/ConversationSelection.svelte'; export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte'; - export { default as MarkdownContent } from './misc/MarkdownContent.svelte'; - export { default as RemoveButton } from './misc/RemoveButton.svelte'; +// Server + export { default as ServerStatus } from './server/ServerStatus.svelte'; export { default as ServerErrorSplash } from './server/ServerErrorSplash.svelte'; export { default as ServerLoadingSplash } from './server/ServerLoadingSplash.svelte'; export { default as ServerInfo } from './server/ServerInfo.svelte'; - -// Shared components -export { default as ActionButton } from './misc/ActionButton.svelte'; -export { default as ActionDropdown } from './misc/ActionDropdown.svelte'; -export { default as ConfirmationDialog } from './dialogs/ConfirmationDialog.svelte'; diff --git a/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte b/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte new file mode 100644 index 0000000000..e2095e0876 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte @@ -0,0 +1,205 @@ + + +
+
+ + + + + {#if searchQuery} + + {/if} +
+ +
+ + {selectedIds.size} of {conversations.length} selected + {#if searchQuery} + ({filteredConversations.length} shown) + {/if} + +
+ +
+ + + + + + + + + + + + + {#if filteredConversations.length === 0} + + + + {:else} + {#each filteredConversations as conv (conv.id)} + toggleConversation(conv.id, e.shiftKey)} + > + + + + + + + {/each} + {/if} + +
+ + Conversation NameMessages
+ {#if searchQuery} + No conversations found matching "{searchQuery}" + {:else} + No conversations available + {/if} +
+ { + e.preventDefault(); + e.stopPropagation(); + toggleConversation(conv.id, e.shiftKey); + }} + /> + +
+ {conv.name || 'Untitled conversation'} +
+
+ {messageCountMap.get(conv.id) ?? 0} +
+
+
+ +
+ + + +
+
diff --git a/tools/server/webui/src/routes/+layout.svelte b/tools/server/webui/src/routes/+layout.svelte index b08bd59c15..dfe094c079 100644 --- a/tools/server/webui/src/routes/+layout.svelte +++ b/tools/server/webui/src/routes/+layout.svelte @@ -1,7 +1,7 @@ + + diff --git a/tools/server/webui/src/stories/ChatSettingsDialog.stories.svelte b/tools/server/webui/src/stories/ChatSettingsDialog.stories.svelte deleted file mode 100644 index 1e53f70708..0000000000 --- a/tools/server/webui/src/stories/ChatSettingsDialog.stories.svelte +++ /dev/null @@ -1,26 +0,0 @@ - - - - - diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 3b42fc8c1d..8e0f8064f7 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -22,7 +22,38 @@ target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_TCP_NODELAY=1 ) -if (LLAMA_OPENSSL) +if (LLAMA_BUILD_BORINGSSL) + set(OPENSSL_NO_ASM ON CACHE BOOL "Disable OpenSSL ASM code (BoringSSL)") + set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)") + + set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository") + set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version") + + message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}") + + include(FetchContent) + FetchContent_Declare( + boringssl + GIT_REPOSITORY ${BORINGSSL_GIT} + GIT_TAG ${BORINGSSL_VERSION} + PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake" + ) + + set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(SAVED_BUILD_TESTING ${BUILD_TESTING}) + + set(BUILD_SHARED_LIBS OFF) + set(BUILD_TESTING OFF) + + FetchContent_MakeAvailable(boringssl) + + set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS}) + set(BUILD_TESTING ${SAVED_BUILD_TESTING}) + + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) + target_link_libraries(${TARGET} PUBLIC ssl crypto) + +elseif (LLAMA_OPENSSL) find_package(OpenSSL) if (OpenSSL_FOUND) include(CheckCSourceCompiles) @@ -44,17 +75,20 @@ if (LLAMA_OPENSSL) set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES}) if (OPENSSL_VERSION_SUPPORTED) message(STATUS "OpenSSL found: ${OPENSSL_VERSION}") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) + set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE) target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto) - if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") - target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) - find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) - find_library(SECURITY_FRAMEWORK Security REQUIRED) - target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) - endif() endif() else() message(STATUS "OpenSSL not found, SSL support disabled") endif() endif() +if (CPPHTTPLIB_OPENSSL_SUPPORT) + target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp + if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) + find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED) + find_library(SECURITY_FRAMEWORK Security REQUIRED) + target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK}) + endif() +endif() diff --git a/vendor/cpp-httplib/patch-boringssl.cmake b/vendor/cpp-httplib/patch-boringssl.cmake new file mode 100644 index 0000000000..2914e1dddb --- /dev/null +++ b/vendor/cpp-httplib/patch-boringssl.cmake @@ -0,0 +1,6 @@ +# Remove bssl +file(READ "CMakeLists.txt" content) +string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}") +string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}") +string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}") +file(WRITE "CMakeLists.txt" "${content}")