Merge branch 'ggml-org:master' into llama-quant-refactor

This commit is contained in:
Bartowski 2026-03-16 09:46:31 -04:00 committed by GitHub
commit 3adf377a15
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
54 changed files with 1962 additions and 1483 deletions

57
.github/workflows/build-3rd-party.yml vendored Normal file
View File

@ -0,0 +1,57 @@
name: CI (3rd-party)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-3rd-party.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-llguidance:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900

140
.github/workflows/build-android.yml vendored Normal file
View File

@ -0,0 +1,140 @@
name: CI (android)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
android:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v6
# Disabled due to size (400MB) and always 0 cache hits
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: android-build
# evict-old-files: 1d
- name: Set up JDK
uses: actions/setup-java@v5
with:
java-version: 17
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@v3
with:
log-accepted-android-sdk-licenses: false
- name: Build
run: |
cd examples/llama.android
./gradlew build --no-daemon
android-ndk:
runs-on: ubuntu-latest
env:
OPENCL_VERSION: 2025.07.22
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
mkdir opencl
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
tar -xaf opencl/headers.tar.gz -C opencl
tar -xaf opencl/clhpp.tar.gz -C opencl
tar -xaf opencl/icd-loader.tar.gz -C opencl
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
cmake --build build
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
rm -rf opencl
- name: Install Hexagon SDK
id: install_hexsdk
if: ${{ matrix.build == 'arm64-snapdragon' }}
env:
HEXSDK_VER: 6.4.0.2
HEXTLS_VER: 19.0.04
run: |
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
mkdir hex-sdk
tar -xaf hex-sdk.tar.gz -C hex-sdk
ls -l hex-sdk
sudo mv hex-sdk /opt/hexagon
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
- name: Update CMake presets
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
- name: Build
id: ndk_build
run: |
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Test
id: cmake_test
run: |
echo "FIXME: test on devices"

214
.github/workflows/build-apple.yml vendored Normal file
View File

@ -0,0 +1,214 @@
name: CI (apple)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-apple.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.swift',
'**/*.m',
'**/*.metal'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-apple.yml',
'ggml/src/ggml-metal/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
macOS-latest-ios:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-ios
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macos-latest-ios-xcode:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v6
- name: Setup Xcode
uses: ggml-org/setup-xcode@v1
with:
xcode-version: latest-stable
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh
- name: Upload xcframework artifact
uses: actions/upload-artifact@v6
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
retention-days: 1
- name: Build Xcode project
run: |
xcodebuild -downloadPlatform iOS
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
macOS-latest-tvos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-tvos
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-visionos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-swift:
runs-on: macos-latest
needs: macos-latest-ios-xcode
strategy:
matrix:
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-swift
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download xcframework artifact
uses: actions/download-artifact@v7
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
- name: Build llama.cpp with CMake
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

View File

@ -37,31 +37,31 @@ jobs:
path: ./vulkan_sdk
version: ${{ env.VULKAN_SDK_VERSION }}
ubuntu-24-spacemit-cache:
runs-on: ubuntu-24.04
#ubuntu-24-spacemit-cache:
# runs-on: ubuntu-24.04
env:
# Make sure this is in sync with build-linux-cross.yml
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
# env:
# # Make sure this is in sync with build-linux-cross.yml
# SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
- name: Setup Cache
uses: actions/cache@v5
id: cache-toolchain
with:
path: ./spacemit_toolchain
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
# - name: Setup Cache
# uses: actions/cache@v5
# id: cache-toolchain
# with:
# path: ./spacemit_toolchain
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
- name: Setup SpacemiT Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-spacemit
with:
path: ./spacemit_toolchain
version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
# - name: Setup SpacemiT Toolchain
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
# uses: ./.github/actions/linux-setup-spacemit
# with:
# path: ./spacemit_toolchain
# version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
ubuntu-24-openvino-cache:
runs-on: ubuntu-24.04

102
.github/workflows/build-cann.yml vendored Normal file
View File

@ -0,0 +1,102 @@
name: CI (cann)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-cann.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-cann.yml',
'ggml/src/ggml-cann/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
openEuler-latest-cann:
defaults:
run:
shell: bash -el {0}
strategy:
matrix:
arch: [x86, aarch64]
chip_type: ['910b', '310p']
build: ['Release']
use_acl_graph: ['on', 'off']
exclude:
# 310P does not support USE_ACL_GRAPH=on
- chip_type: '310p'
use_acl_graph: 'on'
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Free up disk space
uses: ggml-org/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
run: docker pull "${{ steps.cann-image.outputs.image }}"
- name: Build
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
run: |
HOST_UID=$(id -u)
HOST_GID=$(id -g)
docker run --rm \
-v "${PWD}:/workspace" \
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE} \
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
'

View File

@ -1,7 +1,24 @@
name: Build on Linux using cross-compiler
name: CI (cross)
on:
# only manual triggers due to low-importance of the workflows
# TODO: for regular runs, provision dedicated self-hosted runners
workflow_dispatch:
workflow_call:
push:
branches:
- master
paths: [
'.github/workflows/build-cross.yml',
'ggml/src/spacemit/*',
'ggml/src/arch/loongarch/*'
]
# run once every week
schedule:
- cron: '0 0 * * 0'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
# ubuntu-24-riscv64-cpu-cross:
@ -264,15 +281,15 @@ jobs:
steps:
- uses: actions/checkout@v6
- name: Use SpacemiT Toolchain Cache
uses: actions/cache@v5
id: cache-toolchain
with:
path: ./spacemit_toolchain
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
#- name: Use SpacemiT Toolchain Cache
# uses: actions/cache@v5
# id: cache-toolchain
# with:
# path: ./spacemit_toolchain
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
- name: Setup SpacemiT Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-spacemit
with:
path: ./spacemit_toolchain

72
.github/workflows/build-msys.yml vendored Normal file
View File

@ -0,0 +1,72 @@
name: CI (msys)
on:
# only manual triggers due to low-importance of the workflows
# TODO: for regular runs, provision dedicated self-hosted runners
workflow_dispatch:
# run once every week
schedule:
- cron: '0 0 * * 0'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
windows-msys2:
runs-on: windows-2025
strategy:
fail-fast: false
matrix:
include:
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
- { sys: CLANG64, env: clang-x86_64, build: Release }
steps:
- name: Clone
uses: actions/checkout@v6
#- name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: windows-msys2
# variant: ccache
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}
install: >-
base-devel
git
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using CMake
shell: msys2 {0}
run: |
cmake -B build
cmake --build build --config ${{ matrix.build }} -j $(nproc)
- name: Clean after building using CMake
shell: msys2 {0}
run: |
rm -rf build
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config ${{ matrix.build }} -j $(nproc)

136
.github/workflows/build-riscv.yml vendored Normal file
View File

@ -0,0 +1,136 @@
name: CI (riscv)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-riscv.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-riscv.yml',
'ggml/src/ggml-cpu/arch/riscv/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-riscv64-native-sanitizer:
runs-on: RISCV64
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
steps:
- name: Install dependencies
run: |
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
git lfs install
- name: GCC version check
run: |
gcc --version
g++ --version
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Setup ccache
run: |
# Unique cache directory per matrix combination
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
mkdir -p "$CCACHE_DIR"
# Configure ccache
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
# Export for subsequent steps
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=ON \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900

87
.github/workflows/build-sanitize.yml vendored Normal file
View File

@ -0,0 +1,87 @@
name: CI (sanitize)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-sanitize.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-latest-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900

View File

@ -222,15 +222,7 @@ jobs:
id: checkout
uses: actions/checkout@v6
- name: Use OpenVINO Toolkit Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit

96
.github/workflows/build-vulkan.yml vendored Normal file
View File

@ -0,0 +1,96 @@
name: CI (vulkan)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-vulkan.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.comp',
'**/*.glsl'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-vulkan.yml',
'ggml/src/ggml-vulkan/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-vulkan-llvmpipe:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-vulkan-llvmpipe
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
run: |
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Use Vulkan SDK Cache
uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
- name: Setup Vulkan SDK
if: steps.cache-sdk.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-vulkan-llvmpipe
with:
path: ./vulkan_sdk
version: ${{ env.VULKAN_SDK_VERSION }}
- name: Build
id: cmake_build
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
export GGML_VK_DISABLE_COOPMAT=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800

File diff suppressed because it is too large Load Diff

View File

@ -4,10 +4,16 @@ on:
push:
branches:
- master
paths: ['.github/workflows/python-lint.yml', '**/*.py']
paths: [
'.github/workflows/python-lint.yml',
'**/*.py'
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/python-lint.yml', '**/*.py']
paths: [
'.github/workflows/python-lint.yml',
'**/*.py'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

View File

@ -10,7 +10,22 @@ on:
push:
branches:
- master
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: [
'.github/workflows/release.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp',
'**/*.glsl'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -34,7 +49,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
key: macOS-latest-arm64
evict-old-files: 1d
- name: Build
@ -81,7 +96,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
key: macOS-latest-x64
evict-old-files: 1d
- name: Build
@ -140,7 +155,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake-${{ matrix.build }}
key: ubuntu-cpu-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
@ -191,7 +206,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-vulkan
key: ubuntu-22-vulkan
evict-old-files: 1d
- name: Dependencies
@ -256,7 +271,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-openvino-release-no-preset-v1
key: ubuntu-24-openvino-release-no-preset-v1
evict-old-files: 1d
- name: Dependencies
@ -329,7 +344,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-cpu-${{ matrix.arch }}
key: windows-latest-cpu-${{ matrix.arch }}
variant: ccache
evict-old-files: 1d
@ -390,7 +405,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
variant: ccache
evict-old-files: 1d
@ -536,7 +551,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-sycl
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
@ -616,7 +631,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
@ -726,7 +741,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
evict-old-files: 1d
- name: Install ROCm

105
.github/workflows/server-sanitize.yml vendored Normal file
View File

@ -0,0 +1,105 @@
name: Server (sanitize)
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: [
'.github/workflows/server-sanitize.yml',
'**/CMakeLists.txt',
'**/Makefile',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'tools/server/**.*'
]
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
server:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
build_type: [RelWithDebInfo]
fail-fast: false
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_SCHED_NO_REALLOC=ON \
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
pip-install: -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
SLOW_TESTS=1 pytest -v -x

View File

@ -14,7 +14,19 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server-self-hosted.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: [
'.github/workflows/server-self-hosted.yml',
'**/CMakeLists.txt',
'**/Makefile',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.swift',
'**/*.m',
'tools/server/**.*'
]
env:
LLAMA_LOG_COLORS: 1

View File

@ -1,4 +1,3 @@
# Server WebUI build and tests
name: Server WebUI
on:
@ -11,10 +10,20 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
paths: [
'.github/workflows/server-webui.yml',
'tools/server/webui/**.*',
'tools/server/tests/**.*',
'tools/server/public/**'
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
paths: [
'.github/workflows/server-webui.yml',
'tools/server/webui/**.*',
'tools/server/tests/**.*',
'tools/server/public/**'
]
env:
LLAMA_LOG_COLORS: 1

View File

@ -1,4 +1,3 @@
# Server build and tests
name: Server
on:
@ -15,10 +14,34 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: [
'.github/workflows/server.yml',
'**/CMakeLists.txt',
'**/Makefile',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.swift',
'**/*.m',
'tools/server/**.*'
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: [
'.github/workflows/server.yml',
'**/CMakeLists.txt',
'**/Makefile',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.swift',
'**/*.m',
'tools/server/**.*'
]
env:
LLAMA_LOG_COLORS: 1
@ -34,17 +57,18 @@ jobs:
server:
runs-on: ubuntu-latest
name: server (${{ matrix.wf_name }})
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
build_type: [RelWithDebInfo]
build_type: [Release]
wf_name: ["default"]
include:
- build_type: Release
sanitizer: ""
extra_args: ""
wf_name: "default"
- build_type: Release
sanitizer: ""
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "backend-sampling"
fail-fast: false
steps:
@ -74,13 +98,7 @@ jobs:
run: |
cmake -B build \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_SCHED_NO_REALLOC=ON \
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
-DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Python setup

View File

@ -2,29 +2,13 @@
# multiplie collaborators per item can be specified
/.devops/*.Dockerfile @ngxson
/.github/actions/ @CISC
/.github/workflows/ @CISC
/.github/actions/ @ggml-org/ci
/.github/workflows/ @ggml-org/ci
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/chat.* @pwilkin
/common/chat-auto*.* @pwilkin
/common/chat-diff-analyzer.* @pwilkin
/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/jinja/ @ngxson @CISC @aldehir
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/ @ggml-org/llama-common
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
/common/unicode.* @aldehir
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
@ -51,29 +35,27 @@
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
/ggml/src/ggml-common.h @ggerganov
/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
@ -93,16 +75,18 @@
/src/models/ @CISC
/tests/ @ggerganov
/tests/test-chat.* @pwilkin
/tests/test-llama-archs.cpp @JohannesGaessler
/tools/batched-bench/ @ggerganov
/tools/cli/ @ngxson
/tools/completion/ @ggerganov
/tools/mtmd/ @ngxson
/tools/mtmd/ @ggml-org/llama-mtmd
/tools/perplexity/ @ggerganov
/tools/parser/ @pwilkin
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/rpc/ @ggml-org/ggml-rpc
/tools/server/* @ggml-org/llama-server # no subdir
/tools/server/tests/ @ggml-org/llama-server
/tools/server/webui/ @ggml-org/llama-webui
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
/vendor/ @ggerganov

View File

@ -479,6 +479,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
if (!comparison_with_tools || !comparison_with_reasoning) {
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
return;
}
const auto & diff_tools = comparison_with_tools->diff;
@ -911,8 +912,10 @@ void analyze_tools::extract_function_markers() {
// we'll have to rely on an extra diff with no-calls version
auto notool_comp = compare_variants(
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_nocall }); });
auto nt_diff = notool_comp->diff;
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
if (notool_comp) {
auto nt_diff = notool_comp->diff;
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
}
} else {
closer_suffix = diff.suffix.substr(0, diff.suffix.find(suffix_marker));
}

View File

@ -102,7 +102,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
auto is_star = *it == '*';
++it;
if (is_star) {
if (*it == '?') {
if (it != end && *it == '?') {
++it;
}
}

View File

@ -272,8 +272,9 @@ class ModelBase:
return tensors
def dequant_model(self):
if self._is_nvfp4:
return # NVFP4 weights are repacked in _generate_nvfp4_tensors
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
return
tensors_to_remove: list[str] = []
new_tensors: dict[str, Callable[[], Tensor]] = {}
@ -474,7 +475,20 @@ class ModelBase:
tensors_to_remove.append(base_name + "_zero_point")
else:
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
else:
elif quant_method == "modelopt":
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
# are dequantized here. input_scale tensors are unused.
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
weight_name = name.removesuffix("_scale")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
for name in tensors_to_remove:
@ -520,12 +534,6 @@ class ModelBase:
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
if self._is_nvfp4:
if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
return []
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
return []
new_name = self.map_tensor_name(name)
@ -609,6 +617,7 @@ class ModelBase:
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_shapes: dict[tuple[int, str], list[int]] = {}
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
consumed: list[str] = []
for name in list(self.model_tensors.keys()):
if not name.endswith(".weight"):
@ -620,8 +629,18 @@ class ModelBase:
# Force eager materialization of lazy tensors
weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
# Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales)
if scale.ndim < 2:
continue
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
# Mark tensors for removal from model_tensors (already written to gguf)
consumed.extend([name, scale_name])
if scale2_name in self.model_tensors:
consumed.append(scale2_name)
# Check if this is a per-expert tensor
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
if m:
@ -652,6 +671,15 @@ class ModelBase:
for (bid, proj_type) in list(expert_blocks.keys()):
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
# Remove consumed tensors so get_tensors/modify_tensors won't see them
for name in consumed:
self.model_tensors.pop(name, None)
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
for name in list(self.model_tensors.keys()):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
del self.model_tensors[name]
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
experts = expert_blocks.pop(key)
scales = expert_scales.pop(key)
@ -677,20 +705,31 @@ class ModelBase:
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json"
if not quant_algo and quant_config_file.is_file():
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
quant_config = json.load(f).get("quantization") or {}
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
if quant_algo != "NVFP4":
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4"
self.dequant_model()
# NVFP4 weights are repacked and written directly to gguf_writer
# NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
if self._is_nvfp4:
self._generate_nvfp4_tensors()
self.dequant_model()
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if self.tensor_map.mapping:
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

View File

@ -128,6 +128,12 @@ class LoraTorchTensor:
assert dim is None
return self.shape
def contiguous(self) -> LoraTorchTensor:
return LoraTorchTensor(
self._lora_A.contiguous(),
self._lora_B.contiguous(),
)
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
if isinstance(shape[0], tuple):
new_shape: tuple[int, ...] = shape[0]

View File

@ -892,7 +892,7 @@ void launch_fattn(
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
const int gqa_ratio = Q->ne[2] / K->ne[2];
const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2);
const int ntiles_total = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
const int ntiles_dst = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
// Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
@ -919,37 +919,37 @@ void launch_fattn(
GGML_ASSERT(max_blocks_per_sm > 0);
int parallel_blocks = max_blocks_per_sm;
const int ntiles_KV = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by KV cache length.
dim3 blocks_num;
if (stream_k) {
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
const int max_blocks = max_blocks_per_sm*nsm;
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
const int tiles_nwaves = (ntiles_dst + max_blocks - 1) / max_blocks;
const int tiles_efficiency_percent = 100 * ntiles_dst / (max_blocks*tiles_nwaves);
const int nblocks_stream_k = max_blocks;
const int nblocks_stream_k = std::min(max_blocks, ntiles_KV*ntiles_dst);
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_dst;
blocks_num.y = 1;
blocks_num.z = 1;
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
}
} else {
const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
// parallel_blocks must not be larger than what the tensor size allows:
parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
parallel_blocks = std::min(parallel_blocks, ntiles_KV);
// If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
// Test whether parallel_blocks can be set to a higher value for better efficiency.
const int blocks_per_wave = nsm * max_blocks_per_sm;
int nwaves_best = 0;
int efficiency_percent_best = 0;
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) {
const int nblocks_total = ntiles_total * parallel_blocks_test;
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KV; ++parallel_blocks_test) {
const int nblocks_total = ntiles_dst * parallel_blocks_test;
const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
@ -1015,7 +1015,7 @@ void launch_fattn(
CUDA_CHECK(cudaGetLastError());
if (stream_k) {
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
const dim3 block_dim_combine(DV, 1, 1);
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};

View File

@ -1,7 +1,8 @@
#include "gated_delta_net.cuh"
template <int S_v, bool KDA>
__global__ void gated_delta_net_cuda(const float * q,
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
gated_delta_net_cuda(const float * q,
const float * k,
const float * v,
const float * g,
@ -38,7 +39,7 @@ __global__ void gated_delta_net_cuda(const float * q,
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
state += state_offset;
curr_state += state_offset;
curr_state += state_offset + col * S_v;
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
@ -46,10 +47,11 @@ __global__ void gated_delta_net_cuda(const float * q,
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
float s_shard[rows_per_lane];
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = curr_state[col * S_v + i];
s_shard[r] = curr_state[i];
}
for (int t = 0; t < n_tokens; t++) {
@ -63,6 +65,16 @@ __global__ void gated_delta_net_cuda(const float * q,
const float beta_val = *beta_t;
// Cache k and q in registers
float k_reg[rows_per_lane];
float q_reg[rows_per_lane];
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
k_reg[r] = k_t[i];
q_reg[r] = q_t[i];
}
if constexpr (!KDA) {
const float g_val = expf(*g_t);
@ -70,8 +82,7 @@ __global__ void gated_delta_net_cuda(const float * q,
float kv_shard = 0.0f;
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
kv_shard += s_shard[r] * k_t[i];
kv_shard += s_shard[r] * k_reg[r];
}
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
@ -83,9 +94,8 @@ __global__ void gated_delta_net_cuda(const float * q,
float attn_partial = 0.0f;
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
attn_partial += s_shard[r] * q_t[i];
s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col;
attn_partial += s_shard[r] * q_reg[r];
}
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
@ -99,7 +109,7 @@ __global__ void gated_delta_net_cuda(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
}
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
@ -113,8 +123,8 @@ __global__ void gated_delta_net_cuda(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
attn_partial += s_shard[r] * q_t[i];
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
attn_partial += s_shard[r] * q_reg[r];
}
float attn_col = warp_reduce_sum<warp_size>(attn_partial);

View File

@ -124,7 +124,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
err = cudaMallocManaged(ptr, size);
#if defined(GGML_USE_HIP)
if (err == hipSuccess) {
CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
// hipMemAdviseSetCoarseGrain is an optional performance hint;
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
(void)hipGetLastError(); // clear any error
}
// fall back to cudaMalloc if not supported (e.g. on Windows)
@ -251,11 +254,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
// cudaMemGetInfo returns info for the current device
size_t free_mem;
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
@ -270,25 +268,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
device_vmm ? "yes" : "no", prop.warpSize,
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@ -303,6 +301,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
// TODO: Check for future drivers the default scheduling strategy and
// remove this call again when cudaDeviceScheduleSpin is default.
if (prop.major == 12 && prop.minor == 1) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
}

View File

@ -60,11 +60,17 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
enum mmvq_parameter_table_id {
MMVQ_PARAMETERS_GENERIC = 0,
MMVQ_PARAMETERS_GCN,
MMVQ_PARAMETERS_RDNA2
MMVQ_PARAMETERS_RDNA2,
MMVQ_PARAMETERS_RDNA3_0,
MMVQ_PARAMETERS_RDNA4
};
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
#if defined(RDNA4)
return MMVQ_PARAMETERS_RDNA4;
#elif defined(RDNA3_0)
return MMVQ_PARAMETERS_RDNA3_0;
#elif defined(RDNA2) || defined(RDNA3_5)
return MMVQ_PARAMETERS_RDNA2;
#elif defined(GCN) || defined(CDNA)
return MMVQ_PARAMETERS_GCN;
@ -74,7 +80,13 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
}
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return MMVQ_PARAMETERS_RDNA4;
}
if (GGML_CUDA_CC_IS_RDNA3_0(cc)) {
return MMVQ_PARAMETERS_RDNA3_0;
}
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) {
return MMVQ_PARAMETERS_RDNA2;
}
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
@ -83,7 +95,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
return MMVQ_PARAMETERS_GENERIC;
}
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_dst) {
case 1:
@ -114,6 +126,50 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_paramet
return 1;
}
}
if (table_id == MMVQ_PARAMETERS_RDNA4) {
// nwarps=8 benefits types with simple vec_dot on RDNA4 (ncols_dst=1).
// Types with complex vec_dot (Q3_K, IQ2_*, IQ3_*) regress due to register
// pressure and lookup table contention at higher thread counts.
if (ncols_dst == 1) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
return 8;
default:
return 1;
}
}
return 1;
}
if (table_id == MMVQ_PARAMETERS_RDNA3_0) {
// RDNA3 (W7900): stricter whitelist than RDNA4.
// Q2_K / Q5_K / IQ4_XS regress in full quant sweeps.
if (ncols_dst == 1) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
return 8;
default:
return 1;
}
}
return 1;
}
return 1;
}
@ -138,7 +194,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
}
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
@ -151,7 +207,7 @@ static __global__ void mul_mat_vec_q(
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id);
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
@ -355,12 +411,13 @@ static __global__ void mul_mat_vec_q(
}
}
template<ggml_type type>
static std::pair<dim3, dim3> calc_launch_params(
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1);
return {block_nums, block_dims};
}
@ -420,7 +477,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
if (has_ids && ncols_dst > 1) {
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -431,7 +488,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
switch (ncols_dst) {
case 1: {
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -439,7 +496,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 2: {
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -447,7 +504,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 3: {
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -455,7 +512,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 4: {
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -463,7 +520,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 5: {
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -471,7 +528,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 6: {
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -479,7 +536,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 7: {
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@ -487,7 +544,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 8: {
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,

View File

@ -207,6 +207,14 @@
#define RDNA3
#endif // defined(__GFX11__)
#if defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3_5
#endif // defined(__gfx1150__) || defined(__gfx1151__)
#if defined(RDNA3) && !defined(RDNA3_5)
#define RDNA3_0
#endif // defined(RDNA3) && !defined(RDNA3_5)
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2

View File

@ -4767,7 +4767,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
sumqx += w*q*xb[j];
sumq2 += w*q*q;
}
d = sumqx/sumq2;
d = sumq2 > 0 ? sumqx/sumq2 : 0.f;
float best = d*sumqx;
for (int itry = -ntry; itry <= ntry; ++itry) {
id = (itry + values[0])/max;

View File

@ -55,7 +55,7 @@ void gated_delta_net_sycl(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = curr_state[i * S_v + col];
s_shard[r] = curr_state[col * S_v + i];
}
for (int t = 0; t < n_tokens; t++) {
@ -137,7 +137,7 @@ void gated_delta_net_sycl(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
state[i * S_v + col] = s_shard[r];
state[col * S_v + i] = s_shard[r];
}
}

View File

@ -4981,8 +4981,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
// Try to find a non-graphics compute queue and transfer-focused queues
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
// On AMD, the graphics queue seems to be faster, so don't avoid it
const vk::QueueFlagBits graphics_flag = device->vendor_id == VK_VENDOR_ID_AMD ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics;
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1);
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1);
const float priorities[] = { 1.0f, 1.0f };
device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
@ -5441,13 +5443,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
ggml_vk_load_shaders(device);
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN;
if (!device->single_queue) {
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
device->async_use_transfer_queue = (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
} else {
// TODO: Use pointer or reference to avoid copy
device->transfer_queue.copyFrom(device->compute_queue);

View File

@ -245,7 +245,7 @@ void main() {
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += ACC_TYPE(dot(Q_cache[r], K_Tf));
Sf[r][c] += dot(ACC_TYPEV4(Q_cache[r]), ACC_TYPEV4(K_Tf));
}
}
}
@ -270,7 +270,7 @@ void main() {
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += ACC_TYPE(dot(Qf[tile_row(r) * qf_stride + d * D_split + d_tid], K_Tf));
Sf[r][c] += dot(ACC_TYPEV4(Qf[tile_row(r) * qf_stride + d * D_split + d_tid]), ACC_TYPEV4(K_Tf));
}
}
}

View File

@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.37.2"
HTTPLIB_VERSION = "refs/tags/v0.38.0"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@ -1953,6 +1953,12 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
cells.pos_set(i, pos);
if (hparams.n_pos_per_embd() > 1) {
llama_kv_cell_ext ext;
io.read_to(&ext, sizeof(ext));
cells.ext_set(i, ext);
}
for (uint32_t j = 0; j < n_seq_id; ++j) {
llama_seq_id seq_id;
io.read_to(&seq_id, sizeof(seq_id));

View File

@ -7501,6 +7501,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// recurrent / linear-attention weight scales (per-tensor, shape {1})
if (!layer.ssm_in_s && layer.ssm_in) {
layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ssm_out_s && layer.ssm_out) {
layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}

View File

@ -409,7 +409,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
struct ggml_tensor * ffn_up_shexp_s = nullptr;
struct ggml_tensor * ffn_down_shexp_s = nullptr;
struct ggml_tensor * ssm_out_s = nullptr;
struct ggml_tensor * ssm_in_s = nullptr;
struct ggml_tensor * ssm_out_s = nullptr;
struct ggml_tensor * ssm_alpha_s = nullptr;
struct ggml_tensor * ssm_beta_s = nullptr;

View File

@ -42,7 +42,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s);
// split the above in two
// => {d_inner, n_seq_tokens, n_seqs}
ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
@ -137,7 +137,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
cur = build_lora_mm(layer.ssm_out, y);
cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s);
}
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
@ -184,7 +184,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
// d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
// {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s);
// split the above in three
ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
@ -278,7 +278,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
cur = build_lora_mm(model.layers[il].ssm_out, y);
cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s);
}
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}

View File

@ -107,9 +107,9 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
@ -136,7 +136,10 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il,
router_logits);
router_logits, nullptr,
model.layers[il].ffn_up_exps_s,
nullptr, // no gate
model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
if (model.layers[il].ffn_latent_up) {
@ -144,9 +147,9 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
}
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);

View File

@ -215,7 +215,7 @@ struct cli_context {
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
inputs.enable_thinking = common_chat_templates_support_enable_thinking(chat_params.tmpls.get());
inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false;
// Apply chat template to the list of messages
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);

Binary file not shown.

View File

@ -563,7 +563,7 @@ def test_cancel_request():
except requests.exceptions.ReadTimeout:
pass # expected
# make sure the slot is free
time.sleep(1) # wait for HTTP_POLLING_SECONDS
time.sleep(2)
res = server.make_request("GET", "/slots")
assert res.body[0]["is_processing"] == False

View File

@ -939,7 +939,6 @@
"integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"dependencies": {
"@swc/helpers": "^0.5.0"
}
@ -2161,7 +2160,6 @@
"integrity": "sha512-W9R51zUCd2iHOQBg/D93+bdpYv6kbtFx+kft5X8lPKQl6yEu0aKs9i5N5GyCASOhIApgx/tkqZIJ7vgM4cqrHA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"ts-dedent": "^2.0.0",
"type-fest": "~2.19"
@ -2245,7 +2243,6 @@
"integrity": "sha512-875hTUkEbz+MyJIxWbQjfMaekqdmEKUUfR7JyKcpfMRZqcGyrO9Gd+iS1D/Dx8LpE5FEtutWGOtlAh4ReSAiOA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@standard-schema/spec": "^1.0.0",
"@sveltejs/acorn-typescript": "^1.0.5",
@ -2289,7 +2286,6 @@
"integrity": "sha512-YZs/OSKOQAQCnJvM/P+F1URotNnYNeU3P2s4oIpzm1uFaqUEqRxUB0g5ejMjEb5Gjb9/PiBI5Ktrq4rUUF8UVQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@sveltejs/vite-plugin-svelte-inspector": "^5.0.0",
"debug": "^4.4.1",
@ -2705,7 +2701,6 @@
"integrity": "sha512-pemlzrSESWbdAloYml3bAJMEfNh1Z7EduzqPKprCH5S341frlpYnUEW0H72dLxa6IsYr+mPno20GiSm+h9dEdQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@babel/code-frame": "^7.10.4",
"@babel/runtime": "^7.12.5",
@ -2873,7 +2868,6 @@
"integrity": "sha512-+0/4J266CBGPUq/ELg7QUHhN25WYjE0wYTPSQJn1xeu8DOlIOPxXxrNGiLmfAWl7HMMgWFWXpt9IDjMWrF5Iow==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"undici-types": "~7.16.0"
}
@ -2940,7 +2934,6 @@
"integrity": "sha512-IgSWvLobTDOjnaxAfDTIHaECbkNlAlKv2j5SjpB2v7QHKv1FIfjwMy8FsDbVfDX/KjmCmYICcw7uGaXLhtsLNg==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@typescript-eslint/scope-manager": "8.56.0",
"@typescript-eslint/types": "8.56.0",
@ -3177,7 +3170,6 @@
"integrity": "sha512-tJxiPrWmzH8a+w9nLKlQMzAKX/7VjFs50MWgcAj7p9XQ7AQ9/35fByFYptgPELyLw+0aixTnC4pUWV+APcZ/kw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@testing-library/dom": "^10.4.0",
"@testing-library/user-event": "^14.6.1",
@ -3305,7 +3297,6 @@
"integrity": "sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@vitest/utils": "3.2.4",
"pathe": "^2.0.3",
@ -3376,7 +3367,6 @@
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@ -4094,7 +4084,8 @@
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
"integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
"dev": true,
"license": "MIT"
"license": "MIT",
"peer": true
},
"node_modules/debug": {
"version": "4.4.3",
@ -4404,7 +4395,6 @@
"dev": true,
"hasInstallScript": true,
"license": "MIT",
"peer": true,
"bin": {
"esbuild": "bin/esbuild"
},
@ -4465,7 +4455,6 @@
"integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@eslint-community/eslint-utils": "^4.8.0",
"@eslint-community/regexpp": "^4.12.1",
@ -5672,7 +5661,6 @@
"resolved": "https://registry.npmjs.org/hono/-/hono-4.11.7.tgz",
"integrity": "sha512-l7qMiNee7t82bH3SeyUCt9UF15EVmaBvsppY2zQtrbIhl/yzBTny+YUxsVjSjQ6gaqaeVtZmGocom8TzBlA4Yw==",
"license": "MIT",
"peer": true,
"engines": {
"node": ">=16.9.0"
}
@ -8097,7 +8085,6 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"nanoid": "^3.3.11",
"picocolors": "^1.1.1",
@ -8231,7 +8218,6 @@
"integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
"dev": true,
"license": "MIT",
"peer": true,
"bin": {
"prettier": "bin/prettier.cjs"
},
@ -8248,7 +8234,6 @@
"integrity": "sha512-pn1ra/0mPObzqoIQn/vUTR3ZZI6UuZ0sHqMK5x2jMLGrs53h0sXhkVuDcrlssHwIMk7FYrMjHBPoUSyyEEDlBQ==",
"dev": true,
"license": "MIT",
"peer": true,
"peerDependencies": {
"prettier": "^3.0.0",
"svelte": "^3.2.0 || ^4.0.0-next.0 || ^5.0.0-next.0"
@ -8480,7 +8465,6 @@
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=0.10.0"
}
@ -8491,7 +8475,6 @@
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"scheduler": "^0.26.0"
},
@ -8766,7 +8749,6 @@
"integrity": "sha512-4iya7Jb76fVpQyLoiVpzUrsjQ12r3dM7fIVz+4NwoYvZOShknRmiv+iu9CClZml5ZLGb0XMcYLutK6w9tgxHDw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@types/estree": "1.0.8"
},
@ -8877,7 +8859,6 @@
"integrity": "sha512-elOcIZRTM76dvxNAjqYrucTSI0teAF/L2Lv0s6f6b7FOwcwIuA357bIE871580AjHJuSvLIRUosgV+lIWx6Rgg==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"chokidar": "^4.0.0",
"immutable": "^5.0.2",
@ -9172,7 +9153,6 @@
"integrity": "sha512-LwF0VZsT4qkgx66Ad/q0QgZZrU2a5WftaADDEcJ3bGq3O2fHvwWPlSZjM1HiXD4vqP9U5JiMqQkV1gkyH0XJkw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@storybook/global": "^5.0.0",
"@storybook/icons": "^2.0.1",
@ -9387,7 +9367,6 @@
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.48.3.tgz",
"integrity": "sha512-w7QZ398cdNherTdiQ/v3SYLLGOO4948Jgjh04PYqtTYVohmBvbmFwLmo7pp8gp4/1tceRWfSTjHgjtfpCVNJmQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"@jridgewell/remapping": "^2.3.4",
"@jridgewell/sourcemap-codec": "^1.5.0",
@ -9633,7 +9612,6 @@
"integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==",
"dev": true,
"license": "MIT",
"peer": true,
"funding": {
"type": "github",
"url": "https://github.com/sponsors/dcastil"
@ -9664,8 +9642,7 @@
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz",
"integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==",
"dev": true,
"license": "MIT",
"peer": true
"license": "MIT"
},
"node_modules/tapable": {
"version": "2.2.2",
@ -9942,7 +9919,6 @@
"integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@ -10336,7 +10312,6 @@
"integrity": "sha512-BxAKBWmIbrDgrokdGZH1IgkIk/5mMHDreLDmCJ0qpyJaAteP8NvMhkwr/ZCQNqNH97bw/dANTE9PDzqwJghfMQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.5.0",
@ -10497,7 +10472,6 @@
"integrity": "sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@types/chai": "^5.2.2",
"@vitest/expect": "3.2.4",
@ -10819,7 +10793,6 @@
"resolved": "https://registry.npmjs.org/zod/-/zod-4.2.1.tgz",
"integrity": "sha512-0wZ1IRqGGhMP76gLqz8EyfBXKk0J2qo2+H3fi4mcUP/KtTocoX08nmIAHl1Z2kJIZbZee8KOpBCSNPRgauucjw==",
"license": "MIT",
"peer": true,
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}

View File

@ -65,7 +65,8 @@
$effect(() => {
if (conversationModel) {
modelsStore.selectModelByName(conversationModel);
} else if (isRouter && modelsStore.loadedModelIds.length > 0) {
} else if (isRouter && !modelsStore.selectedModelId && modelsStore.loadedModelIds.length > 0) {
// auto-select the first loaded model only when nothing is selected yet
const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model));
if (first) modelsStore.selectModelById(first.id);
}

View File

@ -3,6 +3,7 @@
import { Button } from '$lib/components/ui/button';
import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
import { createMessageCountMap } from '$lib/utils';
import { ISO_DATE_TIME_SEPARATOR } from '$lib/constants';
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
import { toast } from 'svelte-sonner';
@ -55,18 +56,10 @@
})
);
const blob = new Blob([JSON.stringify(allData, null, 2)], {
type: 'application/json'
});
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversations_${new Date().toISOString().split('T')[0]}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
conversationsStore.downloadConversationFile(
allData,
`${new Date().toISOString().split(ISO_DATE_TIME_SEPARATOR)[0]}_conversations.json`
);
exportedConversations = selectedConversations;
showExportSummary = true;

View File

@ -6,6 +6,7 @@
import { parseHeadersToArray, serializeHeaders } from '$lib/utils';
import { UrlProtocol } from '$lib/enums';
import { MCP_SERVER_URL_PLACEHOLDER } from '$lib/constants';
import { mcpStore } from '$lib/stores/mcp.svelte';
interface Props {
url: string;
@ -62,14 +63,33 @@
{/if}
{#if !isWebSocket && onUseProxyChange}
<label class="mt-3 flex cursor-pointer items-center gap-2">
<label
class="mt-3 flex items-start gap-2"
class:cursor-pointer={mcpStore.isProxyAvailable}
class:opacity-80={!mcpStore.isProxyAvailable}
>
<Switch
class="mt-1"
id="use-proxy-{id}"
checked={useProxy}
disabled={!mcpStore.isProxyAvailable}
onCheckedChange={(checked) => onUseProxyChange?.(checked)}
/>
<span class="text-xs text-muted-foreground">Use llama-server proxy</span>
<span>
<span class="text-xs text-muted-foreground">Use llama-server proxy</span>
<br />
{#if !mcpStore.isProxyAvailable}
<span class="inline-flex gap-0.75 text-xs text-muted-foreground/60"
>(Run <pre>llama-server</pre>
with
<pre>--webui-mcp-proxy</pre>
flag)</span
>
{/if}
</span>
</label>
{/if}
</div>

View File

@ -24,6 +24,7 @@ export * from './max-bundle-size';
export * from './mcp';
export * from './mcp-form';
export * from './mcp-resource';
export * from './message-export';
export * from './model-id';
export * from './precision';
export * from './processing-info';

View File

@ -0,0 +1,20 @@
// Conversation filename constants
// Length of the trimmed conversation ID in the filename
export const EXPORT_CONV_ID_TRIM_LENGTH = 8;
// Maximum length of the sanitized conversation name snippet
export const EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH = 20;
// Characters to keep in the ISO timestamp. 19 keeps 2026-01-01T00:00:00
export const ISO_TIMESTAMP_SLICE_LENGTH = 19;
// Replacements for making the conversation title filename-friendly
export const NON_ALPHANUMERIC_REGEX = /[^a-z0-9]/gi;
export const EXPORT_CONV_NONALNUM_REPLACEMENT = '_';
export const MULTIPLE_UNDERSCORE_REGEX = /_+/g;
// Replacements to the ISO date for use in the export filename
export const ISO_DATE_TIME_SEPARATOR = 'T';
export const ISO_DATE_TIME_SEPARATOR_REPLACEMENT = '_';
export const ISO_TIME_SEPARATOR = ':';
export const ISO_TIME_SEPARATOR_REPLACEMENT = '-';

View File

@ -26,6 +26,18 @@ import { config } from '$lib/stores/settings.svelte';
import { filterByLeafNodeId, findLeafNode } from '$lib/utils';
import type { McpServerOverride } from '$lib/types/database';
import { MessageRole } from '$lib/enums';
import {
ISO_DATE_TIME_SEPARATOR,
ISO_DATE_TIME_SEPARATOR_REPLACEMENT,
ISO_TIMESTAMP_SLICE_LENGTH,
EXPORT_CONV_ID_TRIM_LENGTH,
EXPORT_CONV_NONALNUM_REPLACEMENT,
EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH,
ISO_TIME_SEPARATOR,
ISO_TIME_SEPARATOR_REPLACEMENT,
NON_ALPHANUMERIC_REGEX,
MULTIPLE_UNDERSCORE_REGEX
} from '$lib/constants';
class ConversationsStore {
/**
@ -619,6 +631,66 @@ class ConversationsStore {
*
*/
/**
* Generates a sanitized filename for a conversation export
* @param conversation - The conversation metadata
* @param msgs - Optional array of messages belonging to the conversation
* @returns The generated filename string
*/
generateConversationFilename(
conversation: { id?: string; name?: string },
msgs?: DatabaseMessage[]
): string {
const conversationName = (conversation.name ?? '').trim().toLowerCase();
const sanitizedName = conversationName
.replace(NON_ALPHANUMERIC_REGEX, EXPORT_CONV_NONALNUM_REPLACEMENT)
.replace(MULTIPLE_UNDERSCORE_REGEX, '_')
.substring(0, EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH);
// If we have messages, use the timestamp of the newest message
const referenceDate = msgs?.length
? new Date(Math.max(...msgs.map((m) => m.timestamp)))
: new Date();
const iso = referenceDate.toISOString().slice(0, ISO_TIMESTAMP_SLICE_LENGTH);
const formattedDate = iso
.replace(ISO_DATE_TIME_SEPARATOR, ISO_DATE_TIME_SEPARATOR_REPLACEMENT)
.replaceAll(ISO_TIME_SEPARATOR, ISO_TIME_SEPARATOR_REPLACEMENT);
const trimmedConvId = conversation.id?.slice(0, EXPORT_CONV_ID_TRIM_LENGTH) ?? '';
return `${formattedDate}_conv_${trimmedConvId}_${sanitizedName}.json`;
}
/**
* Triggers a browser download of the provided exported conversation data
* @param data - The exported conversation payload (either a single conversation or array of them)
* @param filename - Filename; if omitted, a deterministic name is generated
*/
downloadConversationFile(data: ExportedConversations, filename?: string): void {
// Choose the first conversation or message
const conversation =
'conv' in data ? data.conv : Array.isArray(data) ? data[0]?.conv : undefined;
const msgs =
'messages' in data ? data.messages : Array.isArray(data) ? data[0]?.messages : undefined;
if (!conversation) {
console.error('Invalid data: missing conversation');
return;
}
const downloadFilename = filename ?? this.generateConversationFilename(conversation, msgs);
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = downloadFilename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
}
/**
* Downloads a conversation as JSON file.
* @param convId - The conversation ID to download
@ -636,40 +708,7 @@ class ConversationsStore {
messages = await DatabaseService.getConversationMessages(convId);
}
this.triggerDownload({ conv: conversation, messages });
}
/**
* Exports all conversations with their messages as a JSON file
* @returns The list of exported conversations
*/
async exportAllConversations(): Promise<DatabaseConversation[]> {
const allConversations = await DatabaseService.getAllConversations();
if (allConversations.length === 0) {
throw new Error('No conversations to export');
}
const allData = await Promise.all(
allConversations.map(async (conv) => {
const messages = await DatabaseService.getConversationMessages(conv.id);
return { conv, messages };
})
);
const blob = new Blob([JSON.stringify(allData, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `all_conversations_${new Date().toISOString().split('T')[0]}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
toast.success(`All conversations (${allConversations.length}) prepared for download`);
return allConversations;
this.downloadConversationFile({ conv: conversation, messages });
}
/**
@ -743,37 +782,6 @@ class ConversationsStore {
await this.loadConversations();
return result;
}
/**
* Triggers file download in browser
*/
private triggerDownload(data: ExportedConversations, filename?: string): void {
const conversation =
'conv' in data ? data.conv : Array.isArray(data) ? data[0]?.conv : undefined;
if (!conversation) {
console.error('Invalid data: missing conversation');
return;
}
const conversationName = conversation.name?.trim() || '';
const truncatedSuffix = conversationName
.toLowerCase()
.replace(/[^a-z0-9]/gi, '_')
.replace(/_+/g, '_')
.substring(0, 20);
const downloadFilename = filename || `conversation_${conversation.id}_${truncatedSuffix}.json`;
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = downloadFilename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
}
}
export const conversationsStore = new ConversationsStore();

View File

@ -20,6 +20,7 @@
*/
import { browser } from '$app/environment';
import { base } from '$app/paths';
import { MCPService } from '$lib/services/mcp.service';
import { config, settingsStore } from '$lib/stores/settings.svelte';
import { mcpResourceStore } from '$lib/stores/mcp-resources.svelte';
@ -42,6 +43,7 @@ import {
ToolCallType
} from '$lib/enums';
import {
CORS_PROXY_ENDPOINT,
DEFAULT_CACHE_TTL_MS,
DEFAULT_MCP_CONFIG,
EXPECTED_THEMED_ICON_PAIR_COUNT,
@ -78,165 +80,13 @@ import type { ListChangedHandlers } from '@modelcontextprotocol/sdk/types.js';
import type { DatabaseMessageExtraMcpResource, McpServerOverride } from '$lib/types/database';
import type { SettingsConfigType } from '$lib/types/settings';
export function buildMcpClientConfig(
cfg: SettingsConfigType,
perChatOverrides?: McpServerOverride[]
): MCPClientConfig | undefined {
return buildMcpClientConfigInternal(cfg, perChatOverrides);
}
/**
* Internal helper to build MCP client config.
* Kept as standalone function for external use and tests.
*/
export function buildMcpClientConfigInternal(
cfg: SettingsConfigType,
perChatOverrides?: McpServerOverride[]
): MCPClientConfig | undefined {
const rawServers = parseServerSettings(cfg.mcpServers);
if (!rawServers.length) {
return undefined;
}
const servers: Record<string, MCPServerConfig> = {};
for (const [index, entry] of rawServers.entries()) {
if (!checkServerEnabled(entry, perChatOverrides)) continue;
const normalized = buildServerConfig(entry);
if (normalized) servers[generateMcpServerId(entry.id, index)] = normalized;
}
if (Object.keys(servers).length === 0) {
return undefined;
}
return {
protocolVersion: DEFAULT_MCP_CONFIG.protocolVersion,
capabilities: DEFAULT_MCP_CONFIG.capabilities,
clientInfo: DEFAULT_MCP_CONFIG.clientInfo,
requestTimeoutMs: Math.round(DEFAULT_MCP_CONFIG.requestTimeoutSeconds * 1000),
servers
};
}
/**
* Generates a unique server ID from an optional ID string or index.
* @deprecated Use MCPStore.#generateServerId instead
*/
function generateMcpServerId(id: unknown, index: number): string {
if (typeof id === 'string' && id.trim()) {
return id.trim();
}
return `${MCP_SERVER_ID_PREFIX}-${index + 1}`;
}
/**
* Parses raw server settings from config into MCPServerSettingsEntry array.
* @deprecated Use MCPStore.#parseServerSettings instead
*/
function parseServerSettings(rawServers: unknown): MCPServerSettingsEntry[] {
if (!rawServers) {
return [];
}
let parsed: unknown;
if (typeof rawServers === 'string') {
const trimmed = rawServers.trim();
if (!trimmed) {
return [];
}
try {
parsed = JSON.parse(trimmed);
} catch (error) {
console.warn('[MCP] Failed to parse mcpServers JSON:', error);
return [];
}
} else {
parsed = rawServers;
}
if (!Array.isArray(parsed)) {
return [];
}
return parsed.map((entry, index) => {
const url = typeof entry?.url === 'string' ? entry.url.trim() : '';
const headers = typeof entry?.headers === 'string' ? entry.headers.trim() : undefined;
return {
id: generateMcpServerId((entry as { id?: unknown })?.id, index),
enabled: Boolean((entry as { enabled?: unknown })?.enabled),
url,
name: (entry as { name?: string })?.name,
requestTimeoutSeconds: DEFAULT_MCP_CONFIG.requestTimeoutSeconds,
headers: headers || undefined,
useProxy: Boolean((entry as { useProxy?: unknown })?.useProxy)
} satisfies MCPServerSettingsEntry;
});
}
/**
* Builds server configuration from a settings entry.
* @deprecated Use MCPStore.#buildServerConfig instead
*/
function buildServerConfig(
entry: MCPServerSettingsEntry,
connectionTimeoutMs = DEFAULT_MCP_CONFIG.connectionTimeoutMs
): MCPServerConfig | undefined {
if (!entry?.url) {
return undefined;
}
let headers: Record<string, string> | undefined;
if (entry.headers) {
try {
const parsed = JSON.parse(entry.headers);
if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed))
headers = parsed as Record<string, string>;
} catch {
console.warn('[MCP] Failed to parse custom headers JSON:', entry.headers);
}
}
return {
url: entry.url,
transport: detectMcpTransportFromUrl(entry.url),
handshakeTimeoutMs: connectionTimeoutMs,
requestTimeoutMs: Math.round(entry.requestTimeoutSeconds * 1000),
headers,
useProxy: entry.useProxy
};
}
/**
* Checks if a server is enabled, considering per-chat overrides.
* @deprecated Use MCPStore.#checkServerEnabled instead
*/
function checkServerEnabled(
server: MCPServerSettingsEntry,
perChatOverrides?: McpServerOverride[]
): boolean {
if (!server.enabled) {
return false;
}
if (perChatOverrides) {
const override = perChatOverrides.find((o) => o.serverId === server.id);
return override?.enabled ?? false;
}
return false;
}
class MCPStore {
private _isInitializing = $state(false);
private _error = $state<string | null>(null);
private _toolCount = $state(0);
private _connectedServers = $state<string[]>([]);
private _healthChecks = $state<Record<string, HealthCheckState>>({});
private _proxyAvailable = $state(false);
private connections = new Map<string, MCPConnection>();
private toolsIndex = new Map<string, string>();
@ -246,6 +96,29 @@ class MCPStore {
private initPromise: Promise<boolean> | null = null;
private activeFlowCount = 0;
constructor() {
if (browser) {
this.probeProxy();
}
}
/**
* Probes the CORS proxy endpoint to determine availability.
* The endpoint is only registered when llama-server runs with --webui-mcp-proxy.
*/
async probeProxy(): Promise<void> {
try {
const response = await fetch(`${base}${CORS_PROXY_ENDPOINT}`, { method: 'HEAD' });
this._proxyAvailable = response.status !== 404;
} catch {
this._proxyAvailable = false;
}
}
get isProxyAvailable(): boolean {
return this._proxyAvailable;
}
/**
* Generates a unique server ID from an optional ID string or index.
*/
@ -520,6 +393,7 @@ class MCPStore {
getServerLabel(server: MCPServerSettingsEntry): string {
const healthState = this.getHealthCheckState(server.id);
if (healthState?.status === HealthCheckStatus.SUCCESS)
return (
healthState.serverInfo?.title || healthState.serverInfo?.name || server.name || server.url
@ -603,6 +477,7 @@ class MCPStore {
*/
#proxyIconSrc(src: string): string {
if (src.startsWith('data:')) return src;
if (!this._proxyAvailable) return src;
return getProxiedUrlString(src);
}
@ -629,7 +504,7 @@ class MCPStore {
}
}
return getFaviconUrl(server.url);
return getFaviconUrl(server.url, this._proxyAvailable);
}
isAnyServerLoading(): boolean {
@ -2072,6 +1947,7 @@ export const mcpIsInitializing = () => mcpStore.isInitializing;
export const mcpIsInitialized = () => mcpStore.isInitialized;
export const mcpError = () => mcpStore.error;
export const mcpIsEnabled = () => mcpStore.isEnabled;
export const mcpIsProxyAvailable = () => mcpStore.isProxyAvailable;
export const mcpAvailableTools = () => mcpStore.availableTools;
export const mcpConnectedServerCount = () => mcpStore.connectedServerCount;
export const mcpConnectedServerNames = () => mcpStore.connectedServerNames;

View File

@ -1,6 +1,7 @@
/**
* Utility functions for conversation data manipulation
*/
import type { DatabaseMessage } from '$lib/types';
/**
* Creates a map of conversation IDs to their message counts from exported conversation data

View File

@ -17,7 +17,7 @@ import {
* @param urlString - The URL to get the favicon for
* @returns The favicon URL or null if invalid
*/
export function getFaviconUrl(urlString: string): string | null {
export function getFaviconUrl(urlString: string, useProxy = true): string | null {
try {
const url = new URL(urlString);
const hostnameParts = url.hostname.split(DOMAIN_SEPARATOR);
@ -27,7 +27,7 @@ export function getFaviconUrl(urlString: string): string | null {
: url.hostname;
const googleFaviconUrl = `${GOOGLE_FAVICON_BASE_URL}?domain=${rootDomain}&sz=${DEFAULT_FAVICON_SIZE}`;
return getProxiedUrlString(googleFaviconUrl);
return useProxy ? getProxiedUrlString(googleFaviconUrl) : googleFaviconUrl;
} catch {
return null;
}

View File

@ -231,7 +231,7 @@
<Sidebar.Trigger
class="transition-left absolute left-0 z-[900] duration-200 ease-linear {sidebarOpen
? 'md:left-[var(--sidebar-width)]'
: ''}"
: 'md:left-0!'}"
style="translate: 1rem 1rem;"
/>
{/if}

View File

@ -1025,6 +1025,30 @@ bool is_valid_path(const std::string &path) {
return true;
}
bool canonicalize_path(const char *path, std::string &resolved) {
#if defined(_WIN32)
char buf[_MAX_PATH];
if (_fullpath(buf, path, _MAX_PATH) == nullptr) { return false; }
resolved = buf;
#else
char buf[PATH_MAX];
if (realpath(path, buf) == nullptr) { return false; }
resolved = buf;
#endif
return true;
}
bool is_path_within_base(const std::string &resolved_path,
const std::string &resolved_base) {
#if defined(_WIN32)
return _strnicmp(resolved_path.c_str(), resolved_base.c_str(),
resolved_base.size()) == 0;
#else
return strncmp(resolved_path.c_str(), resolved_base.c_str(),
resolved_base.size()) == 0;
#endif
}
FileStat::FileStat(const std::string &path) {
#if defined(_WIN32)
auto wpath = u8string_to_wstring(path.c_str());
@ -2627,33 +2651,114 @@ bool can_compress_content_type(const std::string &content_type) {
}
}
bool parse_quality(const char *b, const char *e, std::string &token,
double &quality) {
quality = 1.0;
token.clear();
// Split on first ';': left = token name, right = parameters
const char *params_b = nullptr;
std::size_t params_len = 0;
divide(
b, static_cast<std::size_t>(e - b), ';',
[&](const char *lb, std::size_t llen, const char *rb, std::size_t rlen) {
auto r = trim(lb, lb + llen, 0, llen);
if (r.first < r.second) { token.assign(lb + r.first, lb + r.second); }
params_b = rb;
params_len = rlen;
});
if (token.empty()) { return false; }
if (params_len == 0) { return true; }
// Scan parameters for q= (stops on first match)
bool invalid = false;
split_find(params_b, params_b + params_len, ';',
(std::numeric_limits<size_t>::max)(),
[&](const char *pb, const char *pe) -> bool {
// Match exactly "q=" or "Q=" (not "query=" etc.)
auto len = static_cast<size_t>(pe - pb);
if (len < 2) { return false; }
if ((pb[0] != 'q' && pb[0] != 'Q') || pb[1] != '=') {
return false;
}
// Trim the value portion
auto r = trim(pb, pe, 2, len);
if (r.first >= r.second) {
invalid = true;
return true;
}
double v = 0.0;
auto res = from_chars(pb + r.first, pb + r.second, v);
if (res.ec != std::errc{} || v < 0.0 || v > 1.0) {
invalid = true;
return true;
}
quality = v;
return true;
});
return !invalid;
}
EncodingType encoding_type(const Request &req, const Response &res) {
auto ret =
detail::can_compress_content_type(res.get_header_value("Content-Type"));
if (!ret) { return EncodingType::None; }
if (!can_compress_content_type(res.get_header_value("Content-Type"))) {
return EncodingType::None;
}
const auto &s = req.get_header_value("Accept-Encoding");
(void)(s);
if (s.empty()) { return EncodingType::None; }
// Single-pass: iterate tokens and track the best supported encoding.
// Server preference breaks ties (br > gzip > zstd).
EncodingType best = EncodingType::None;
double best_q = 0.0; // q=0 means "not acceptable"
// Server preference: Brotli > Gzip > Zstd (lower = more preferred)
auto priority = [](EncodingType t) -> int {
switch (t) {
case EncodingType::Brotli: return 0;
case EncodingType::Gzip: return 1;
case EncodingType::Zstd: return 2;
default: return 3;
}
};
std::string name;
split(s.data(), s.data() + s.size(), ',', [&](const char *b, const char *e) {
double quality = 1.0;
if (!parse_quality(b, e, name, quality)) { return; }
if (quality <= 0.0) { return; }
EncodingType type = EncodingType::None;
#ifdef CPPHTTPLIB_BROTLI_SUPPORT
// TODO: 'Accept-Encoding' has br, not br;q=0
ret = s.find("br") != std::string::npos;
if (ret) { return EncodingType::Brotli; }
if (case_ignore::equal(name, "br")) { type = EncodingType::Brotli; }
#endif
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
// TODO: 'Accept-Encoding' has gzip, not gzip;q=0
ret = s.find("gzip") != std::string::npos;
if (ret) { return EncodingType::Gzip; }
if (type == EncodingType::None && case_ignore::equal(name, "gzip")) {
type = EncodingType::Gzip;
}
#endif
#ifdef CPPHTTPLIB_ZSTD_SUPPORT
// TODO: 'Accept-Encoding' has zstd, not zstd;q=0
ret = s.find("zstd") != std::string::npos;
if (ret) { return EncodingType::Zstd; }
if (type == EncodingType::None && case_ignore::equal(name, "zstd")) {
type = EncodingType::Zstd;
}
#endif
return EncodingType::None;
if (type == EncodingType::None) { return; }
// Higher q-value wins; for equal q, server preference breaks ties
if (quality > best_q ||
(quality == best_q && priority(type) < priority(best))) {
best_q = quality;
best = type;
}
});
return best;
}
bool nocompressor::compress(const char *data, size_t data_length,
@ -2937,6 +3042,21 @@ create_decompressor(const std::string &encoding) {
return decompressor;
}
// Returns the best available compressor and its Content-Encoding name.
// Priority: Brotli > Gzip > Zstd (matches server-side preference).
std::pair<std::unique_ptr<compressor>, const char *>
create_compressor() {
#ifdef CPPHTTPLIB_BROTLI_SUPPORT
return {detail::make_unique<brotli_compressor>(), "br"};
#elif defined(CPPHTTPLIB_ZLIB_SUPPORT)
return {detail::make_unique<gzip_compressor>(), "gzip"};
#elif defined(CPPHTTPLIB_ZSTD_SUPPORT)
return {detail::make_unique<zstd_compressor>(), "zstd"};
#else
return {nullptr, nullptr};
#endif
}
bool is_prohibited_header_name(const std::string &name) {
using udl::operator""_t;
@ -3769,7 +3889,7 @@ bool parse_accept_header(const std::string &s,
struct AcceptEntry {
std::string media_type;
double quality;
int order; // Original order in header
int order;
};
std::vector<AcceptEntry> entries;
@ -3787,48 +3907,12 @@ bool parse_accept_header(const std::string &s,
}
AcceptEntry accept_entry;
accept_entry.quality = 1.0; // Default quality
accept_entry.order = order++;
// Find q= parameter
auto q_pos = entry.find(";q=");
if (q_pos == std::string::npos) { q_pos = entry.find("; q="); }
if (q_pos != std::string::npos) {
// Extract media type (before q parameter)
accept_entry.media_type = trim_copy(entry.substr(0, q_pos));
// Extract quality value
auto q_start = entry.find('=', q_pos) + 1;
auto q_end = entry.find(';', q_start);
if (q_end == std::string::npos) { q_end = entry.length(); }
std::string quality_str =
trim_copy(entry.substr(q_start, q_end - q_start));
if (quality_str.empty()) {
has_invalid_entry = true;
return;
}
{
double v = 0.0;
auto res = detail::from_chars(
quality_str.data(), quality_str.data() + quality_str.size(), v);
if (res.ec == std::errc{}) {
accept_entry.quality = v;
} else {
has_invalid_entry = true;
return;
}
}
// Check if quality is in valid range [0.0, 1.0]
if (accept_entry.quality < 0.0 || accept_entry.quality > 1.0) {
has_invalid_entry = true;
return;
}
} else {
// No quality parameter, use entire entry as media type
accept_entry.media_type = entry;
if (!parse_quality(entry.data(), entry.data() + entry.size(),
accept_entry.media_type, accept_entry.quality)) {
has_invalid_entry = true;
return;
}
// Remove additional parameters from media type
@ -5481,7 +5565,8 @@ std::string decode_path_component(const std::string &component) {
// Unicode %uXXXX encoding
auto val = 0;
if (detail::from_hex_to_i(component, i + 2, 4, val)) {
// 4 digits Unicode codes
// 4 digits Unicode codes: val is 0x0000-0xFFFF (from 4 hex digits),
// so to_utf8 writes at most 3 bytes. buff[4] is safe.
char buff[4];
size_t len = detail::to_utf8(val, buff);
if (len > 0) { result.append(buff, len); }
@ -5586,6 +5671,30 @@ std::string decode_query_component(const std::string &component,
return result;
}
std::string sanitize_filename(const std::string &filename) {
// Extract basename: find the last path separator (/ or \)
auto pos = filename.find_last_of("/\\");
auto result =
(pos != std::string::npos) ? filename.substr(pos + 1) : filename;
// Strip null bytes
result.erase(std::remove(result.begin(), result.end(), '\0'), result.end());
// Trim whitespace
{
auto start = result.find_first_not_of(" \t");
auto end = result.find_last_not_of(" \t");
result = (start == std::string::npos)
? ""
: result.substr(start, end - start + 1);
}
// Reject . and ..
if (result == "." || result == "..") { return ""; }
return result;
}
std::string append_query_params(const std::string &path,
const Params &params) {
std::string path_with_query = path;
@ -6714,7 +6823,18 @@ bool Server::set_mount_point(const std::string &mount_point,
if (stat.is_dir()) {
std::string mnt = !mount_point.empty() ? mount_point : "/";
if (!mnt.empty() && mnt[0] == '/') {
base_dirs_.push_back({std::move(mnt), dir, std::move(headers)});
std::string resolved_base;
if (detail::canonicalize_path(dir.c_str(), resolved_base)) {
#if defined(_WIN32)
if (resolved_base.back() != '\\' && resolved_base.back() != '/') {
resolved_base += '\\';
}
#else
if (resolved_base.back() != '/') { resolved_base += '/'; }
#endif
}
base_dirs_.push_back(
{std::move(mnt), dir, std::move(resolved_base), std::move(headers)});
return true;
}
}
@ -6874,6 +6994,20 @@ Server &Server::set_payload_max_length(size_t length) {
return *this;
}
Server &Server::set_websocket_ping_interval(time_t sec) {
websocket_ping_interval_sec_ = sec;
return *this;
}
template <class Rep, class Period>
Server &Server::set_websocket_ping_interval(
const std::chrono::duration<Rep, Period> &duration) {
detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t /*usec*/) {
set_websocket_ping_interval(sec);
});
return *this;
}
bool Server::bind_to_port(const std::string &host, int port,
int socket_flags) {
auto ret = bind_internal(host, port, socket_flags);
@ -7294,6 +7428,18 @@ bool Server::handle_file_request(Request &req, Response &res) {
auto path = entry.base_dir + sub_path;
if (path.back() == '/') { path += "index.html"; }
// Defense-in-depth: is_valid_path blocks ".." traversal in the URL,
// but symlinks/junctions can still escape the base directory.
if (!entry.resolved_base_dir.empty()) {
std::string resolved_path;
if (detail::canonicalize_path(path.c_str(), resolved_path) &&
!detail::is_path_within_base(resolved_path,
entry.resolved_base_dir)) {
res.status = StatusCode::Forbidden_403;
return true;
}
}
detail::FileStat stat(path);
if (stat.is_dir()) {
@ -8012,7 +8158,7 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
{
// Use WebSocket-specific read timeout instead of HTTP timeout
strm.set_read_timeout(CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND, 0);
ws::WebSocket ws(strm, req, true);
ws::WebSocket ws(strm, req, true, websocket_ping_interval_sec_);
entry.handler(req, ws);
}
return true;
@ -8256,6 +8402,13 @@ bool ClientImpl::ensure_socket_connection(Socket &socket, Error &error) {
return create_and_connect_socket(socket, error);
}
bool ClientImpl::setup_proxy_connection(
Socket & /*socket*/,
std::chrono::time_point<std::chrono::steady_clock> /*start_time*/,
Response & /*res*/, bool & /*success*/, Error & /*error*/) {
return true;
}
void ClientImpl::shutdown_ssl(Socket & /*socket*/,
bool /*shutdown_gracefully*/) {
// If there are any requests in flight from threads other than us, then it's
@ -8377,27 +8530,14 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
return false;
}
#ifdef CPPHTTPLIB_SSL_ENABLED
// TODO: refactoring
if (is_ssl()) {
auto &scli = static_cast<SSLClient &>(*this);
if (!proxy_host_.empty() && proxy_port_ != -1) {
auto success = false;
if (!scli.connect_with_proxy(socket_, req.start_time_, res, success,
error)) {
if (!success) { output_error_log(error, &req); }
return success;
}
}
if (!proxy_host_.empty() && proxy_port_ != -1) {
if (!scli.initialize_ssl(socket_, error)) {
output_error_log(error, &req);
return false;
}
{
auto success = true;
if (!setup_proxy_connection(socket_, req.start_time_, res, success,
error)) {
if (!success) { output_error_log(error, &req); }
return success;
}
}
#endif
}
// Mark the current socket as being in use so that it cannot be closed by
@ -8558,17 +8698,15 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
return handle;
}
#ifdef CPPHTTPLIB_SSL_ENABLED
if (is_ssl()) {
auto &scli = static_cast<SSLClient &>(*this);
if (!proxy_host_.empty() && proxy_port_ != -1) {
if (!scli.initialize_ssl(socket_, handle.error)) {
handle.response.reset();
return handle;
}
{
auto success = true;
auto start_time = std::chrono::steady_clock::now();
if (!setup_proxy_connection(socket_, start_time, *handle.response,
success, handle.error)) {
if (!success) { handle.response.reset(); }
return handle;
}
}
#endif
}
transfer_socket_ownership_to_handle(handle);
@ -8847,7 +8985,7 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
if (res.get_header_value("Connection") == "close" ||
(res.version == "HTTP/1.0" && res.reason != "Connection established")) {
// TODO this requires a not-entirely-obvious chain of calls to be correct
// NOTE: this requires a not-entirely-obvious chain of calls to be correct
// for this to be safe.
// This is safe to call because handle_request is only called by send_
@ -9086,14 +9224,9 @@ bool ClientImpl::write_content_with_provider(Stream &strm,
auto is_shutting_down = []() { return false; };
if (req.is_chunked_content_provider_) {
// TODO: Brotli support
std::unique_ptr<detail::compressor> compressor;
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
if (compress_) {
compressor = detail::make_unique<detail::gzip_compressor>();
} else
#endif
{
auto compressor = compress_ ? detail::create_compressor().first
: std::unique_ptr<detail::compressor>();
if (!compressor) {
compressor = detail::make_unique<detail::nocompressor>();
}
@ -9324,14 +9457,15 @@ ClientImpl::send_with_content_provider_and_receiver(
Error &error) {
if (!content_type.empty()) { req.set_header("Content-Type", content_type); }
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
if (compress_) { req.set_header("Content-Encoding", "gzip"); }
#endif
auto enc = compress_
? detail::create_compressor()
: std::pair<std::unique_ptr<detail::compressor>, const char *>(
nullptr, nullptr);
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
if (compress_ && !content_provider_without_length) {
// TODO: Brotli support
detail::gzip_compressor compressor;
if (enc.second) { req.set_header("Content-Encoding", enc.second); }
if (enc.first && !content_provider_without_length) {
auto &compressor = enc.first;
if (content_provider) {
auto ok = true;
@ -9342,7 +9476,7 @@ ClientImpl::send_with_content_provider_and_receiver(
if (ok) {
auto last = offset + data_len == content_length;
auto ret = compressor.compress(
auto ret = compressor->compress(
data, data_len, last,
[&](const char *compressed_data, size_t compressed_data_len) {
req.body.append(compressed_data, compressed_data_len);
@ -9366,19 +9500,17 @@ ClientImpl::send_with_content_provider_and_receiver(
}
}
} else {
if (!compressor.compress(body, content_length, true,
[&](const char *data, size_t data_len) {
req.body.append(data, data_len);
return true;
})) {
if (!compressor->compress(body, content_length, true,
[&](const char *data, size_t data_len) {
req.body.append(data, data_len);
return true;
})) {
error = Error::Compression;
output_error_log(error, &req);
return nullptr;
}
}
} else
#endif
{
} else {
if (content_provider) {
req.content_length_ = content_length;
req.content_provider_ = std::move(content_provider);
@ -11545,6 +11677,24 @@ bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
return ClientImpl::create_and_connect_socket(socket, error);
}
bool SSLClient::setup_proxy_connection(
Socket &socket,
std::chrono::time_point<std::chrono::steady_clock> start_time,
Response &res, bool &success, Error &error) {
if (proxy_host_.empty() || proxy_port_ == -1) { return true; }
if (!connect_with_proxy(socket, start_time, res, success, error)) {
return false;
}
if (!initialize_ssl(socket, error)) {
success = false;
return false;
}
return true;
}
// Assumes that socket_mutex_ is locked and that there are no requests in
// flight
bool SSLClient::connect_with_proxy(
@ -16061,11 +16211,11 @@ WebSocket::~WebSocket() {
}
void WebSocket::start_heartbeat() {
if (ping_interval_sec_ == 0) { return; }
ping_thread_ = std::thread([this]() {
std::unique_lock<std::mutex> lock(ping_mutex_);
while (!closed_) {
ping_cv_.wait_for(lock, std::chrono::seconds(
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND));
ping_cv_.wait_for(lock, std::chrono::seconds(ping_interval_sec_));
if (closed_) { break; }
lock.unlock();
if (!send_frame(Opcode::Ping, nullptr, 0)) {
@ -16203,7 +16353,8 @@ bool WebSocketClient::connect() {
Request req;
req.method = "GET";
req.path = path_;
ws_ = std::unique_ptr<WebSocket>(new WebSocket(std::move(strm), req, false));
ws_ = std::unique_ptr<WebSocket>(
new WebSocket(std::move(strm), req, false, websocket_ping_interval_sec_));
return true;
}
@ -16243,6 +16394,10 @@ void WebSocketClient::set_write_timeout(time_t sec, time_t usec) {
write_timeout_usec_ = usec;
}
void WebSocketClient::set_websocket_ping_interval(time_t sec) {
websocket_ping_interval_sec_ = sec;
}
#ifdef CPPHTTPLIB_SSL_ENABLED
void WebSocketClient::set_ca_cert_path(const std::string &path) {

View File

@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.37.2"
#define CPPHTTPLIB_VERSION_NUM "0x002502"
#define CPPHTTPLIB_VERSION "0.38.0"
#define CPPHTTPLIB_VERSION_NUM "0x002600"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@ -1666,6 +1666,11 @@ public:
Server &set_payload_max_length(size_t length);
Server &set_websocket_ping_interval(time_t sec);
template <class Rep, class Period>
Server &set_websocket_ping_interval(
const std::chrono::duration<Rep, Period> &duration);
bool bind_to_port(const std::string &host, int port, int socket_flags = 0);
int bind_to_any_port(const std::string &host, int socket_flags = 0);
bool listen_after_bind();
@ -1700,6 +1705,8 @@ protected:
time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND;
time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND;
size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
time_t websocket_ping_interval_sec_ =
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND;
private:
using Handlers =
@ -1769,6 +1776,7 @@ private:
struct MountPointEntry {
std::string mount_point;
std::string base_dir;
std::string resolved_base_dir;
Headers headers;
};
std::vector<MountPointEntry> base_dirs_;
@ -2186,6 +2194,10 @@ protected:
virtual bool create_and_connect_socket(Socket &socket, Error &error);
virtual bool ensure_socket_connection(Socket &socket, Error &error);
virtual bool setup_proxy_connection(
Socket &socket,
std::chrono::time_point<std::chrono::steady_clock> start_time,
Response &res, bool &success, Error &error);
// All of:
// shutdown_ssl
@ -2712,6 +2724,10 @@ private:
std::function<bool(Stream &strm)> callback) override;
bool is_ssl() const override;
bool setup_proxy_connection(
Socket &socket,
std::chrono::time_point<std::chrono::steady_clock> start_time,
Response &res, bool &success, Error &error) override;
bool connect_with_proxy(
Socket &sock,
std::chrono::time_point<std::chrono::steady_clock> start_time,
@ -2911,6 +2927,8 @@ std::string encode_query_component(const std::string &component,
std::string decode_query_component(const std::string &component,
bool plus_as_space = true);
std::string sanitize_filename(const std::string &filename);
std::string append_query_params(const std::string &path, const Params &params);
std::pair<std::string, std::string> make_range_header(const Ranges &ranges);
@ -3714,15 +3732,19 @@ private:
friend class httplib::Server;
friend class WebSocketClient;
WebSocket(Stream &strm, const Request &req, bool is_server)
: strm_(strm), req_(req), is_server_(is_server) {
WebSocket(
Stream &strm, const Request &req, bool is_server,
time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND)
: strm_(strm), req_(req), is_server_(is_server),
ping_interval_sec_(ping_interval_sec) {
start_heartbeat();
}
WebSocket(std::unique_ptr<Stream> &&owned_strm, const Request &req,
bool is_server)
WebSocket(
std::unique_ptr<Stream> &&owned_strm, const Request &req, bool is_server,
time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND)
: strm_(*owned_strm), owned_strm_(std::move(owned_strm)), req_(req),
is_server_(is_server) {
is_server_(is_server), ping_interval_sec_(ping_interval_sec) {
start_heartbeat();
}
@ -3733,6 +3755,7 @@ private:
std::unique_ptr<Stream> owned_strm_;
Request req_;
bool is_server_;
time_t ping_interval_sec_;
std::atomic<bool> closed_{false};
std::mutex write_mutex_;
std::thread ping_thread_;
@ -3761,6 +3784,7 @@ public:
const std::string &subprotocol() const;
void set_read_timeout(time_t sec, time_t usec = 0);
void set_write_timeout(time_t sec, time_t usec = 0);
void set_websocket_ping_interval(time_t sec);
#ifdef CPPHTTPLIB_SSL_ENABLED
void set_ca_cert_path(const std::string &path);
@ -3784,6 +3808,8 @@ private:
time_t read_timeout_usec_ = 0;
time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND;
time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND;
time_t websocket_ping_interval_sec_ =
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND;
#ifdef CPPHTTPLIB_SSL_ENABLED
bool is_ssl_ = false;