diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index 5d6c87ed6b..3112ec85ef 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -53,10 +53,11 @@ RUN apt-get update \ && apt-get install -y \ build-essential \ git \ - python3 \ - python3-dev \ + python3.13 \ + python3.13-dev \ python3-pip \ python3-wheel \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \ && pip install --break-system-packages --upgrade setuptools \ && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ diff --git a/.github/workflows/build-3rd-party.yml b/.github/workflows/build-3rd-party.yml new file mode 100644 index 0000000000..642d978644 --- /dev/null +++ b/.github/workflows/build-3rd-party.yml @@ -0,0 +1,57 @@ +name: CI (3rd-party) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-3rd-party.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-24-llguidance: + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libssl-dev + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_LLGUIDANCE=ON + cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml new file mode 100644 index 0000000000..cd9d99ffab --- /dev/null +++ 
b/.github/workflows/build-android.yml @@ -0,0 +1,140 @@ +name: CI (android) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-android.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-android.yml', + 'examples/llama.android/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + android: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v6 + + # Disabled due to size (400MB) and always 0 cache hits + # - name: ccache + # uses: ggml-org/ccache-action@v1.2.16 + # with: + # key: android-build + # evict-old-files: 1d + + - name: Set up JDK + uses: actions/setup-java@v5 + with: + java-version: 17 + distribution: zulu + + - name: Setup Android SDK + uses: android-actions/setup-android@v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Build + run: | + cd examples/llama.android + ./gradlew build --no-daemon + + android-ndk: + runs-on: ubuntu-latest + + env: + OPENCL_VERSION: 2025.07.22 + + strategy: + matrix: + include: + - build: 'arm64-cpu' + defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF' + - build: 'arm64-snapdragon' + defines: '--preset arm64-android-snapdragon-release' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'arm64-snapdragon' }} + run: | + mkdir opencl + curl -L -o opencl/clhpp.tar.gz 
https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + tar -xaf opencl/headers.tar.gz -C opencl + tar -xaf opencl/clhpp.tar.gz -C opencl + tar -xaf opencl/icd-loader.tar.gz -C opencl + sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include + sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL + cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION} + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared + cmake --build build + sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android + rm -rf opencl + + - name: Install Hexagon SDK + id: install_hexsdk + if: ${{ matrix.build == 'arm64-snapdragon' }} + env: + HEXSDK_VER: 6.4.0.2 + HEXTLS_VER: 19.0.04 + run: | + curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz + mkdir hex-sdk + tar -xaf hex-sdk.tar.gz -C hex-sdk + ls -l hex-sdk + sudo mv hex-sdk /opt/hexagon + echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV" + echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" + echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV" + echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV" + echo "DEFAULT_NO_QURT_INC=0" 
>> "$GITHUB_ENV" + echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV" + + - name: Update CMake presets + id: update_presets + if: ${{ matrix.build == 'arm64-snapdragon' }} + run: | + cp docs/backend/snapdragon/CMakeUserPresets.json . + + - name: Build + id: ndk_build + run: | + cmake ${{ matrix.defines }} -B build + cmake --build build + cmake --install build --prefix pkg-adb/llama.cpp + + - name: Test + id: cmake_test + run: | + echo "FIXME: test on devices" diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml new file mode 100644 index 0000000000..51f0ef2302 --- /dev/null +++ b/.github/workflows/build-apple.yml @@ -0,0 +1,214 @@ +name: CI (apple) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-apple.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.swift', + '**/*.m', + '**/*.metal' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-apple.yml', + 'ggml/src/ggml-metal/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + macOS-latest-ios: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: macOS-latest-ios + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + 
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + macos-latest-ios-xcode: + runs-on: macos-latest + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Setup Xcode + uses: ggml-org/setup-xcode@v1 + with: + xcode-version: latest-stable + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + - name: xcodebuild for swift package + id: xcodebuild + run: | + ./build-xcframework.sh + + - name: Upload xcframework artifact + uses: actions/upload-artifact@v6 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + retention-days: 1 + + - name: Build Xcode project + run: | + xcodebuild -downloadPlatform iOS + xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + + macOS-latest-tvos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: macOS-latest-tvos + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + 
-DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=tvOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + macOS-latest-visionos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + + macOS-latest-swift: + runs-on: macos-latest + needs: macos-latest-ios-xcode + + strategy: + matrix: + destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: macOS-latest-swift + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Download xcframework artifact + uses: actions/download-artifact@v7 + with: + name: llama-xcframework + path: build-apple/llama.xcframework/ + + - name: Build llama.cpp with CMake + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) diff --git a/.github/workflows/build-cache.yml 
b/.github/workflows/build-cache.yml index 7cfdaff605..bc0a92c7fc 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -37,37 +37,37 @@ jobs: path: ./vulkan_sdk version: ${{ env.VULKAN_SDK_VERSION }} - ubuntu-24-spacemit-cache: - runs-on: ubuntu-24.04 + #ubuntu-24-spacemit-cache: + # runs-on: ubuntu-24.04 - env: - # Make sure this is in sync with build-linux-cross.yml - SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2" + # env: + # # Make sure this is in sync with build-linux-cross.yml + # SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 + # steps: + # - name: Clone + # id: checkout + # uses: actions/checkout@v6 - - name: Setup Cache - uses: actions/cache@v5 - id: cache-toolchain - with: - path: ./spacemit_toolchain - key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} + # - name: Setup Cache + # uses: actions/cache@v5 + # id: cache-toolchain + # with: + # path: ./spacemit_toolchain + # key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} - - name: Setup SpacemiT Toolchain - if: steps.cache-toolchain.outputs.cache-hit != 'true' - uses: ./.github/actions/linux-setup-spacemit - with: - path: ./spacemit_toolchain - version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }} + # - name: Setup SpacemiT Toolchain + # if: steps.cache-toolchain.outputs.cache-hit != 'true' + # uses: ./.github/actions/linux-setup-spacemit + # with: + # path: ./spacemit_toolchain + # version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }} ubuntu-24-openvino-cache: runs-on: ubuntu-24.04 env: - # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile OPENVINO_VERSION_MAJOR: "2026.0" OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" diff --git a/.github/workflows/build-cann.yml b/.github/workflows/build-cann.yml 
new file mode 100644 index 0000000000..de641ca148 --- /dev/null +++ b/.github/workflows/build-cann.yml @@ -0,0 +1,102 @@ +name: CI (cann) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-cann.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-cann.yml', + 'ggml/src/ggml-cann/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + openEuler-latest-cann: + defaults: + run: + shell: bash -el {0} + strategy: + matrix: + arch: [x86, aarch64] + chip_type: ['910b', '310p'] + build: ['Release'] + use_acl_graph: ['on', 'off'] + exclude: + # 310P does not support USE_ACL_GRAPH=on + - chip_type: '310p' + use_acl_graph: 'on' + runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Set container image + id: cann-image + run: | + image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" + echo "image=${image}" >> "${GITHUB_OUTPUT}" + + - name: Pull container image + run: docker pull "${{ steps.cann-image.outputs.image }}" + + - name: Build + env: + BUILD_TYPE: ${{ matrix.build }} + SOC_TYPE: ascend${{ matrix.chip_type }} + USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} + run: | + HOST_UID=$(id -u) + HOST_GID=$(id -g) + + docker run --rm \ + -v "${PWD}:/workspace" \ + -w /workspace \ + -e SOC_TYPE=${SOC_TYPE} \ + -e BUILD_TYPE=${BUILD_TYPE} \ + -e 
USE_ACL_GRAPH=${USE_ACL_GRAPH} \ + "${{ steps.cann-image.outputs.image }}" \ + bash -lc ' + set -e + yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel + yum clean all && rm -rf /var/cache/yum + git config --global --add safe.directory "/workspace" + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${SOC_TYPE} \ + -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} + cmake --build build -j $(nproc) + + chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build + ' diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index 259efa43c8..84cf8ddf48 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -5,7 +5,7 @@ on: jobs: linux: - runs-on: ubuntu-24.04 + runs-on: ubuntu-slim steps: - uses: actions/checkout@v6 with: @@ -14,7 +14,7 @@ jobs: - name: Install dependencies run: | sudo apt update - sudo apt install -y build-essential tcl + sudo apt install -y build-essential tcl cmake - name: Build run: | diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-cross.yml similarity index 93% rename from .github/workflows/build-linux-cross.yml rename to .github/workflows/build-cross.yml index 8b6ebaf4a3..74508129ac 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-cross.yml @@ -1,7 +1,24 @@ -name: Build on Linux using cross-compiler +name: CI (cross) on: + # only manual triggers due to low-importance of the workflows + # TODO: for regular runs, provision dedicated self-hosted runners workflow_dispatch: - workflow_call: + push: + branches: + - master + paths: [ + '.github/workflows/build-cross.yml', + 'ggml/src/ggml-cpu/spacemit/*', + 'ggml/src/ggml-cpu/arch/loongarch/*' + ] + # run once every week + schedule: + - cron: '0 0 * * 0' + +concurrency: + group: 
${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: # ubuntu-24-riscv64-cpu-cross: @@ -142,7 +159,7 @@ jobs: # cmake --build build --config Release -j $(nproc) debian-13-loongarch64-cpu-cross: - runs-on: ubuntu-24.04 + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 steps: @@ -197,7 +214,7 @@ jobs: cmake --build build --config Release -j $(nproc) debian-13-loongarch64-vulkan-cross: - runs-on: ubuntu-24.04 + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 steps: @@ -264,15 +281,15 @@ jobs: steps: - uses: actions/checkout@v6 - - name: Use SpacemiT Toolchain Cache - uses: actions/cache@v5 - id: cache-toolchain - with: - path: ./spacemit_toolchain - key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} + #- name: Use SpacemiT Toolchain Cache + # uses: actions/cache@v5 + # id: cache-toolchain + # with: + # path: ./spacemit_toolchain + # key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} - name: Setup SpacemiT Toolchain - if: steps.cache-toolchain.outputs.cache-hit != 'true' + #if: steps.cache-toolchain.outputs.cache-hit != 'true' uses: ./.github/actions/linux-setup-spacemit with: path: ./spacemit_toolchain diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml new file mode 100644 index 0000000000..431d9b6a53 --- /dev/null +++ b/.github/workflows/build-msys.yml @@ -0,0 +1,72 @@ +name: CI (msys) + +on: + # only manual triggers due to low-importance of the workflows + # TODO: for regular runs, provision dedicated self-hosted runners + workflow_dispatch: + # run once every week + schedule: + - cron: '0 0 * * 0' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: 
true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + windows-msys2: + runs-on: windows-2025 + + strategy: + fail-fast: false + matrix: + include: + - { sys: UCRT64, env: ucrt-x86_64, build: Release } + - { sys: CLANG64, env: clang-x86_64, build: Release } + + steps: + - name: Clone + uses: actions/checkout@v6 + + #- name: ccache + # uses: ggml-org/ccache-action@v1.2.16 + # with: + # key: windows-msys2 + # variant: ccache + # evict-old-files: 1d + # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Setup ${{ matrix.sys }} + uses: msys2/setup-msys2@v2 + with: + update: true + msystem: ${{matrix.sys}} + install: >- + base-devel + git + mingw-w64-${{matrix.env}}-toolchain + mingw-w64-${{matrix.env}}-cmake + mingw-w64-${{matrix.env}}-openblas + + - name: Build using CMake + shell: msys2 {0} + run: | + cmake -B build + cmake --build build --config ${{ matrix.build }} -j $(nproc) + + - name: Clean after building using CMake + shell: msys2 {0} + run: | + rm -rf build + + - name: Build using CMake w/ OpenBLAS + shell: msys2 {0} + run: | + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS + cmake --build build --config ${{ matrix.build }} -j $(nproc) diff --git a/.github/workflows/build-riscv.yml b/.github/workflows/build-riscv.yml new file mode 100644 index 0000000000..36a3a1155a --- /dev/null +++ b/.github/workflows/build-riscv.yml @@ -0,0 +1,136 @@ +name: CI (riscv) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-riscv.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-riscv.yml', + 'ggml/src/ggml-cpu/arch/riscv/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + 
cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-riscv64-native-sanitizer: + runs-on: RISCV64 + + continue-on-error: true + + strategy: + matrix: + sanitizer: [ADDRESS, THREAD, UNDEFINED] + build_type: [Debug] + + steps: + - name: Install dependencies + run: | + sudo apt-get update + + # Install necessary packages + sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs + + # Set gcc-14 and g++-14 as the default compilers + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 + sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc + sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ + + # Install Rust stable version + rustup install stable + rustup default stable + + git lfs install + + - name: GCC version check + run: | + gcc --version + g++ --version + + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Setup ccache + run: | + # Unique cache directory per matrix combination + export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}" + mkdir -p "$CCACHE_DIR" + + # Configure ccache + ccache --set-config=max_size=5G + ccache --set-config=compression=true + ccache --set-config=compression_level=6 + ccache --set-config=cache_dir="$CCACHE_DIR" + ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime + ccache --set-config=hash_dir=false + + # Export for subsequent steps + echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV + echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV + + - name: Build + id: cmake_build + if: ${{ matrix.sanitizer != 'THREAD' }} + run: | + cmake -B build \ + -DLLAMA_OPENSSL=OFF \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DGGML_OPENMP=ON \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + 
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 + + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + + - name: Build (no OpenMP) + id: cmake_build_no_openmp + if: ${{ matrix.sanitizer == 'THREAD' }} + run: | + cmake -B build \ + -DLLAMA_OPENSSL=OFF \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 + + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 diff --git a/.github/workflows/build-sanitize.yml b/.github/workflows/build-sanitize.yml new file mode 100644 index 0000000000..0b17857504 --- /dev/null +++ b/.github/workflows/build-sanitize.yml @@ -0,0 +1,87 @@ +name: CI (sanitize) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-sanitize.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-latest-sanitizer: + runs-on: ubuntu-latest + + continue-on-error: true + + strategy: + matrix: + sanitizer: [ADDRESS, THREAD, UNDEFINED] + build_type: [Debug] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: 
ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }} + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libssl-dev + + - name: Build + id: cmake_build + if: ${{ matrix.sanitizer != 'THREAD' }} + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + + - name: Build (no OpenMP) + id: cmake_build_no_openmp + if: ${{ matrix.sanitizer == 'THREAD' }} + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DGGML_OPENMP=OFF + + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml new file mode 100644 index 0000000000..7c7710fe45 --- /dev/null +++ b/.github/workflows/build-self-hosted.yml @@ -0,0 +1,242 @@ +name: CI (self-hosted) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-self-hosted.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp', + '**/*.glsl', + '**/*.wgsl' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-self-hosted.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + 
'**/*.metal', + '**/*.comp', + '**/*.glsl', + '**/*.wgsl' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ggml-ci-nvidia-cuda: + runs-on: [self-hosted, Linux, NVIDIA] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + nvidia-smi + GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + ggml-ci-nvidia-vulkan-cm: + runs-on: [self-hosted, Linux, NVIDIA] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + vulkaninfo --summary + GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + ggml-ci-nvidia-vulkan-cm2: + runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + vulkaninfo --summary + GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + ggml-ci-cpu-amx: + runs-on: [self-hosted, Linux, CPU, AMX] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + # ggml-ci-amd-vulkan: + # runs-on: [self-hosted, Linux, AMD] + + # steps: + # - name: Clone + # id: checkout + # uses: actions/checkout@v6 + + # - name: Test + # id: ggml-ci + # run: | + # vulkaninfo --summary + # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + # ggml-ci-amd-rocm: + # runs-on: [self-hosted, Linux, AMD] + + # steps: + # - name: Clone + # id: checkout + # uses: actions/checkout@v6 + + # - name: Test + # id: ggml-ci + # run: | + # amd-smi static + # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + + 
ggml-ci-mac-metal: + runs-on: [self-hosted, macOS, ARM64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-mac-webgpu: + runs-on: [self-hosted, macOS, ARM64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dawn Dependency + id: dawn-depends + run: | + DAWN_VERSION="v2.0.0" + DAWN_OWNER="reeselevine" + DAWN_REPO="dawn" + DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release" + echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" + curl -L -o artifact.zip \ + "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" + mkdir dawn + unzip artifact.zip + tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1 + + - name: Test + id: ggml-ci + run: | + GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ + bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-mac-vulkan: + runs-on: [self-hosted, macOS, ARM64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + vulkaninfo --summary + GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-linux-intel-vulkan: + runs-on: [self-hosted, Linux, Intel] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: Test + id: ggml-ci + run: | + vulkaninfo --summary + GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-intel-openvino-gpu-low-perf: + runs-on: [self-hosted, Linux, Intel, OpenVINO] + + env: + # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.0" + OPENVINO_VERSION_FULL: 
"2026.0.0.20965.c6d6a13a886" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Setup OpenVINO Toolkit + uses: ./.github/actions/linux-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + + - name: Install OpenVINO dependencies + run: | + cd ./openvino_toolkit + chmod +x ./install_dependencies/install_openvino_dependencies.sh + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh + + - name: Test + id: ggml-ci + run: | + source ./openvino_toolkit/setupvars.sh + GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml new file mode 100644 index 0000000000..b25ec51471 --- /dev/null +++ b/.github/workflows/build-vulkan.yml @@ -0,0 +1,96 @@ +name: CI (vulkan) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-vulkan.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.comp', + '**/*.glsl' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-vulkan.yml', + 'ggml/src/ggml-vulkan/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-24-vulkan-llvmpipe: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-24-vulkan-llvmpipe + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Dependencies + id: depends + run: | + sudo 
add-apt-repository -y ppa:kisak/kisak-mesa + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev + + - name: Get latest Vulkan SDK version + id: vulkan_sdk_version + run: | + echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV" + + - name: Use Vulkan SDK Cache + uses: actions/cache@v5 + id: cache-sdk + with: + path: ./vulkan_sdk + key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} + + - name: Setup Vulkan SDK + if: steps.cache-sdk.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-vulkan-llvmpipe + with: + path: ./vulkan_sdk + version: ${{ env.VULKAN_SDK_VERSION }} + + - name: Build + id: cmake_build + run: | + source ./vulkan_sdk/setup-env.sh + cmake -B build \ + -DGGML_VULKAN=ON + cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + export GGML_VK_VISIBLE_DEVICES=0 + export GGML_VK_DISABLE_F16=1 + export GGML_VK_DISABLE_COOPMAT=1 + # This is using llvmpipe and runs slower than other backends + ctest -L main --verbose --timeout 4800 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 22a96e23fb..fef08d4c00 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,6 @@ on: - master paths: [ '.github/workflows/build.yml', - '.github/workflows/build-linux-cross.yml', '.github/workflows/build-cmake-pkg.yml', '**/CMakeLists.txt', '**/.cmake', @@ -29,7 +28,6 @@ on: types: [opened, synchronize, reopened] paths: [ '.github/workflows/build.yml', - '.github/workflows/build-linux-cross.yml', '.github/workflows/build-cmake-pkg.yml', '**/CMakeLists.txt', '**/.cmake', @@ -59,7 +57,10 @@ env: LLAMA_LOG_TIMESTAMPS: 1 jobs: - macOS-latest-cmake-arm64: + build-cmake-pkg: + uses: ./.github/workflows/build-cmake-pkg.yml + + macOS-latest-arm64: runs-on: macos-latest steps: @@ -70,7 +71,7 @@ jobs: - name: ccache uses: 
ggml-org/ccache-action@v1.2.16 with: - key: macOS-latest-cmake-arm64 + key: macOS-latest-arm64 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -95,7 +96,7 @@ jobs: cd build ctest -L main -E "test-llama-archs" --verbose --timeout 900 - macOS-latest-cmake-x64: + macOS-latest-x64: runs-on: macos-15-intel steps: @@ -106,7 +107,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: macOS-latest-cmake-x64 + key: macOS-latest-x64 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -131,7 +132,7 @@ jobs: cd build ctest -L main --verbose --timeout 900 - macOS-latest-cmake-arm64-webgpu: + macOS-latest-arm64-webgpu: runs-on: macos-latest steps: @@ -142,7 +143,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: macOS-latest-cmake-arm64-webgpu + key: macOS-latest-arm64-webgpu evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -173,7 +174,7 @@ jobs: cd build ctest -L main --verbose --timeout 900 - ubuntu-cpu-cmake: + ubuntu-cpu: strategy: matrix: include: @@ -196,7 +197,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-cpu-cmake-${{ matrix.build }} + key: ubuntu-cpu-${{ matrix.build }} evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -258,94 +259,7 @@ jobs: wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - 
evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libssl-dev - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: + ubuntu-latest-rpc: runs-on: ubuntu-latest continue-on-error: true @@ -355,12 +269,6 @@ jobs: id: checkout uses: actions/checkout@v6 - # - name: ccache - # uses: ggml-org/ccache-action@v1.2.16 - # with: - # key: ubuntu-latest-cmake-rpc - # evict-old-files: 1d - - name: Dependencies id: depends run: | @@ -380,21 +288,14 @@ jobs: cd build ctest -L main --verbose - ubuntu-24-cmake-vulkan-deb: - runs-on: 
ubuntu-24.04 + ubuntu-24-vulkan: + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} steps: - name: Clone id: checkout uses: actions/checkout@v6 - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: ubuntu-24-cmake-vulkan-deb - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - name: Dependencies id: depends run: | @@ -414,7 +315,7 @@ jobs: run: | cmake --build build -j $(nproc) - ubuntu-24-cmake-vulkan: + ubuntu-24-webgpu: runs-on: ubuntu-24.04 steps: @@ -425,66 +326,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-24-cmake-vulkan - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo add-apt-repository -y ppa:kisak/kisak-mesa - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - - - name: Get latest Vulkan SDK version - id: vulkan_sdk_version - run: | - echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV" - - - name: Use Vulkan SDK Cache - uses: actions/cache@v5 - id: cache-sdk - with: - path: ./vulkan_sdk - key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} - - - name: Setup Vulkan SDK - if: steps.cache-sdk.outputs.cache-hit != 'true' - uses: ./.github/actions/linux-setup-vulkan - with: - path: ./vulkan_sdk - version: ${{ env.VULKAN_SDK_VERSION }} - - - name: Build - id: cmake_build - run: | - source ./vulkan_sdk/setup-env.sh - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - export GGML_VK_VISIBLE_DEVICES=0 - export GGML_VK_DISABLE_F16=1 - export GGML_VK_DISABLE_COOPMAT=1 - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 4800 - - ubuntu-24-cmake-webgpu: - runs-on: 
ubuntu-24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: ubuntu-24-cmake-webgpu + key: ubuntu-24-webgpu evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -542,23 +384,16 @@ jobs: run: | cd build # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 3600 + ctest -L main --verbose --timeout 900 - ubuntu-24-wasm-webgpu: - runs-on: ubuntu-24.04 + ubuntu-24-webgpu-wasm: + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} steps: - name: Clone id: checkout uses: actions/checkout@v6 - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: ubuntu-latest-wasm-webgpu - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - name: Install Emscripten run: | git clone https://github.com/emscripten-core/emsdk.git @@ -585,7 +420,7 @@ jobs: cmake --build build-wasm --target test-backend-ops -j $(nproc) - ubuntu-22-cmake-hip: + ubuntu-22-hip: runs-on: ubuntu-22.04 container: rocm/dev-ubuntu-22.04:6.1.2 @@ -603,7 +438,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-hip + key: ubuntu-22-hip evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -616,7 +451,7 @@ jobs: -DGGML_HIP=ON cmake --build build --config Release -j $(nproc) - ubuntu-22-cmake-musa: + ubuntu-22-musa: runs-on: ubuntu-22.04 container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64 @@ -634,7 +469,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-musa + key: ubuntu-22-musa evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -645,7 +480,7 @@ jobs: -DGGML_MUSA=ON cmake --build build --config Release -j $(nproc) - ubuntu-22-cmake-sycl: + ubuntu-22-sycl: runs-on: ubuntu-22.04 
continue-on-error: true @@ -680,7 +515,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-sycl + key: ubuntu-22-sycl evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -694,7 +529,7 @@ jobs: -DCMAKE_CXX_COMPILER=icpx cmake --build build --config Release -j $(nproc) - ubuntu-22-cmake-sycl-fp16: + ubuntu-22-sycl-fp16: runs-on: ubuntu-22.04 continue-on-error: true @@ -729,7 +564,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-sycl-fp16 + key: ubuntu-22-sycl-fp16 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -744,8 +579,8 @@ jobs: -DGGML_SYCL_F16=ON cmake --build build --config Release -j $(nproc) - ubuntu-24-cmake-openvino: - name: ubuntu-24-cmake-openvino-${{ matrix.openvino_device }} + ubuntu-24-openvino: + name: ubuntu-24-openvino-${{ matrix.openvino_device }} strategy: matrix: include: @@ -759,7 +594,7 @@ jobs: runs-on: ${{ fromJSON(matrix.runner) }} env: - # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile OPENVINO_VERSION_MAJOR: "2026.0" OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" @@ -769,10 +604,12 @@ jobs: uses: actions/checkout@v6 - name: ccache + if: runner.environment == 'github-hosted' uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-24-cmake-openvino-${{ matrix.variant }}-no-preset-v1 + key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1 evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -782,6 +619,7 @@ jobs: sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd - name: Use OpenVINO Toolkit Cache + if: runner.environment == 'github-hosted' uses: actions/cache@v5 id: 
cache-openvino with: @@ -821,194 +659,7 @@ jobs: fi ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 - build-linux-cross: - uses: ./.github/workflows/build-linux-cross.yml - - build-cmake-pkg: - uses: ./.github/workflows/build-cmake-pkg.yml - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-visionos: - runs-on: macos-latest - - steps: 
- - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - needs: ios-xcode-build - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Download xcframework artifact - uses: actions/download-artifact@v7 - with: - name: llama-xcframework - path: build-apple/llama.xcframework/ - - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_OPENSSL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - windows-msys2: - runs-on: windows-2025 - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: ccache - evict-old-files: 1d - save: ${{ 
github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: + windows-latest: runs-on: windows-2025 env: @@ -1043,7 +694,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-${{ matrix.build }} + key: windows-latest-${{ matrix.build }} variant: ccache evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -1130,7 +781,7 @@ jobs: # $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 # & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - ubuntu-latest-cmake-cuda: + ubuntu-latest-cuda: runs-on: ubuntu-latest container: nvidia/cuda:12.6.2-devel-ubuntu24.04 @@ -1149,7 +800,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-latest-cmake-cuda + key: ubuntu-latest-cuda evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -1166,7 +817,7 @@ jobs: -DGGML_CUDA_CUB_3DOT2=ON cmake --build build - windows-2022-cmake-cuda: + windows-2022-cuda: runs-on: windows-2022 strategy: @@ -1215,7 +866,7 @@ jobs: cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release - windows-latest-cmake-sycl: + windows-latest-sycl: runs-on: windows-2022 defaults: @@ -1234,7 +885,7 @@ jobs: - 
name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-sycl + key: windows-latest-sycl variant: ccache evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -1249,7 +900,7 @@ jobs: id: cmake_build run: examples/sycl/win-build-sycl.bat - windows-latest-cmake-hip: + windows-latest-hip: runs-on: windows-2022 env: @@ -1316,223 +967,95 @@ jobs: -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - ios-xcode-build: - runs-on: macos-latest + ubuntu-cpu-riscv64-native: + runs-on: RISCV64 steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Xcode - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable - - - name: Build - id: cmake_build + - name: Install dependencies run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_OPENSSL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + sudo apt-get update - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh + # Install necessary packages + sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs - - name: Upload xcframework artifact - uses: actions/upload-artifact@v6 - with: - name: llama-xcframework - path: build-apple/llama.xcframework/ - retention-days: 1 + # Set gcc-14 and g++-14 as the default compilers + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 + sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc + sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - - name: Build Xcode 
project - run: | - xcodebuild -downloadPlatform iOS - xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + # Install Rust stable version + rustup install stable + rustup default stable - android-build: - runs-on: ubuntu-latest + git lfs install - steps: - - name: Clone - uses: actions/checkout@v6 - - # Disabled due to size (400MB) and always 0 cache hits - # - name: ccache - # uses: ggml-org/ccache-action@v1.2.16 - # with: - # key: android-build - # evict-old-files: 1d - - - name: Set up JDK - uses: actions/setup-java@v5 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - ./gradlew build --no-daemon - - android-ndk-build: - runs-on: ubuntu-latest - - env: - OPENCL_VERSION: 2025.07.22 - - strategy: - matrix: - include: - - build: 'arm64-cpu' - defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF' - - build: 'arm64-snapdragon' - defines: '--preset arm64-android-snapdragon-release' - - steps: - name: Clone id: checkout uses: actions/checkout@v6 - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'arm64-snapdragon' }} + - name: Check environment run: | - mkdir opencl - curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - curl -L -o opencl/icd-loader.tar.gz 
https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - tar -xaf opencl/headers.tar.gz -C opencl - tar -xaf opencl/clhpp.tar.gz -C opencl - tar -xaf opencl/icd-loader.tar.gz -C opencl - sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include - sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL - cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION} - cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared - cmake --build build - sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android - rm -rf opencl + uname -a + gcc --version + g++ --version + ldd --version + cmake --version + rustc --version - - name: Install Hexagon SDK - id: install_hexsdk - if: ${{ matrix.build == 'arm64-snapdragon' }} - env: - HEXSDK_VER: 6.4.0.2 - HEXTLS_VER: 19.0.04 + - name: Setup ccache run: | - curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz - mkdir hex-sdk - tar -xaf hex-sdk.tar.gz -C hex-sdk - ls -l hex-sdk - sudo mv hex-sdk /opt/hexagon - echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV" - echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" - echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV" - echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV" - echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV" - echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV" + # Set unique cache directory for this job + export 
CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native" + mkdir -p "$CCACHE_DIR" - - name: Update CMake presets - id: update_presets - if: ${{ matrix.build == 'arm64-snapdragon' }} - run: | - cp docs/backend/snapdragon/CMakeUserPresets.json . + # Configure ccache for optimal performance + ccache --set-config=max_size=5G + ccache --set-config=compression=true + ccache --set-config=compression_level=6 + ccache --set-config=cache_dir="$CCACHE_DIR" + + # Enable more aggressive caching + ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime + ccache --set-config=hash_dir=false + + # Export for subsequent steps + echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV + echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV - name: Build - id: ndk_build + id: cmake_build run: | - cmake ${{ matrix.defines }} -B build - cmake --build build - cmake --install build --prefix pkg-adb/llama.cpp + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DGGML_RPC=ON \ + -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 + + cmake --build build --config Release -j $(nproc) - name: Test id: cmake_test run: | - echo "FIXME: test on devices" + cd build + ctest -L main --verbose --timeout 900 - openEuler-latest-cmake-cann: - defaults: - run: - shell: bash -el {0} - strategy: - matrix: - arch: [x86, aarch64] - chip_type: ['910b', '310p'] - build: ['Release'] - use_acl_graph: ['on', 'off'] - exclude: - # 310P does not support USE_ACL_GRAPH=on - - chip_type: '310p' - use_acl_graph: 'on' - runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - steps: - - name: Checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Free up disk space - uses: ggml-org/free-disk-space@v1.3.1 - with: - tool-cache: true - - - 
name: Set container image - id: cann-image + - name: Test llama2c conversion + id: llama2c_test run: | - image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" - echo "image=${image}" >> "${GITHUB_OUTPUT}" - - - name: Pull container image - run: docker pull "${{ steps.cann-image.outputs.image }}" - - - name: Build - env: - BUILD_TYPE: ${{ matrix.build }} - SOC_TYPE: ascend${{ matrix.chip_type }} - USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} - run: | - HOST_UID=$(id -u) - HOST_GID=$(id -g) - - docker run --rm \ - -v "${PWD}:/workspace" \ - -w /workspace \ - -e SOC_TYPE=${SOC_TYPE} \ - -e BUILD_TYPE=${BUILD_TYPE} \ - -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \ - "${{ steps.cann-image.outputs.image }}" \ - bash -lc ' - set -e - yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel - yum clean all && rm -rf /var/cache/yum - git config --global --add safe.directory "/workspace" - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - cmake -S . 
-B build \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${SOC_TYPE} \ - -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} - cmake --build build -j $(nproc) - - chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build - ' + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 # TODO: simplify the following workflows using a matrix # TODO: run lighter CI on PRs and the full CI only on master (if needed) @@ -1666,160 +1189,6 @@ jobs: run: | LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - ggml-ci-x64-nvidia-cuda: - runs-on: [self-hosted, Linux, X64, NVIDIA] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Test - id: ggml-ci - run: | - nvidia-smi - GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - ggml-ci-x64-nvidia-vulkan-cm: - runs-on: [self-hosted, Linux, X64, NVIDIA] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Test - id: ggml-ci - run: | - vulkaninfo --summary - GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - ggml-ci-x64-nvidia-vulkan-cm2: - runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Test - id: ggml-ci - run: | - vulkaninfo --summary - GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - ggml-ci-x64-cpu-amx: - runs-on: [self-hosted, Linux, X64, CPU, AMX] - - steps: - - name: Clone - id: checkout - uses: 
actions/checkout@v6 - - - name: Test - id: ggml-ci - run: | - bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - # ggml-ci-x64-amd-vulkan: - # runs-on: [self-hosted, Linux, X64, AMD] - - # steps: - # - name: Clone - # id: checkout - # uses: actions/checkout@v6 - - # - name: Test - # id: ggml-ci - # run: | - # vulkaninfo --summary - # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - # ggml-ci-x64-amd-rocm: - # runs-on: [self-hosted, Linux, X64, AMD] - - # steps: - # - name: Clone - # id: checkout - # uses: actions/checkout@v6 - - # - name: Test - # id: ggml-ci - # run: | - # amd-smi static - # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - - ggml-ci-mac-metal: - runs-on: [self-hosted, macOS, ARM64] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Test - id: ggml-ci - run: | - GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - - ggml-ci-mac-webgpu: - runs-on: [self-hosted, macOS, ARM64] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dawn Dependency - id: dawn-depends - run: | - DAWN_VERSION="v2.0.0" - DAWN_OWNER="reeselevine" - DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release" - echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" - curl -L -o artifact.zip \ - "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" - mkdir dawn - unzip artifact.zip - tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1 - - - name: Test - id: ggml-ci - run: | - GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ - bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - - ggml-ci-mac-vulkan: - runs-on: [self-hosted, macOS, ARM64] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 
- - - name: Test - id: ggml-ci - run: | - vulkaninfo --summary - GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - - ggml-ci-x64-linux-intel-vulkan: - runs-on: [self-hosted, Linux, X64, Intel] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - with: - persist-credentials: false - - - name: Test - id: ggml-ci - run: | - vulkaninfo --summary - GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-arm64-cpu-kleidiai: runs-on: ubuntu-22.04-arm @@ -1846,382 +1215,7 @@ jobs: run: | GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - ggml-ci-x64-intel-openvino-gpu-low-perf: - runs-on: [self-hosted, Linux, X64, Intel, OpenVINO] - - env: - # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Use OpenVINO Toolkit Cache - uses: actions/cache@v5 - id: cache-openvino - with: - path: ./openvino_toolkit - key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} - - - name: Setup OpenVINO Toolkit - if: steps.cache-openvino.outputs.cache-hit != 'true' - uses: ./.github/actions/linux-setup-openvino - with: - path: ./openvino_toolkit - version_major: ${{ env.OPENVINO_VERSION_MAJOR }} - version_full: ${{ env.OPENVINO_VERSION_FULL }} - - - name: Install OpenVINO dependencies - run: | - cd ./openvino_toolkit - chmod +x ./install_dependencies/install_openvino_dependencies.sh - echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh - - - name: Test - id: ggml-ci - run: | - source ./openvino_toolkit/setupvars.sh - GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ubuntu-cpu-cmake-riscv64-native: - runs-on: RISCV64 - - steps: - - name: Install dependencies - run: | - sudo apt-get update - 
- # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs - - # Set gcc-14 and g++-14 as the default compilers - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - - # Install Rust stable version - rustup install stable - rustup default stable - - git lfs install - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Check environment - run: | - uname -a - gcc --version - g++ --version - ldd --version - cmake --version - rustc --version - - - name: Setup ccache - run: | - # Set unique cache directory for this job - export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native" - mkdir -p "$CCACHE_DIR" - - # Configure ccache for optimal performance - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - - # Enable more aggressive caching - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - # Export for subsequent steps - echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=ON \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DGGML_RPC=ON \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 - - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Test llama2c conversion - id: 
llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - ubuntu-cmake-sanitizer-riscv64-native: - runs-on: RISCV64 - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Install dependencies - run: | - sudo apt-get update - - # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs - - # Set gcc-14 and g++-14 as the default compilers - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - - # Install Rust stable version - rustup install stable - rustup default stable - - git lfs install - - - name: GCC version check - run: | - gcc --version - g++ --version - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Setup ccache - run: | - # Unique cache directory per matrix combination - export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}" - mkdir -p "$CCACHE_DIR" - - # Configure ccache - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - # Export for subsequent steps - echo 
"CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_OPENSSL=OFF \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=ON \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 - - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_OPENSSL=OFF \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 - - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - ubuntu-llguidance-riscv64-native: - runs-on: RISCV64 - steps: - - name: Install dependencies - run: | - sudo apt-get update - - # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs - - # Set gcc-14 and g++-14 as the default compilers - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - - # Install Rust stable version - rustup install stable - rustup default 
stable - - git lfs install - - - name: GCC version check - run: | - gcc --version - g++ --version - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Setup ccache - run: | - export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64" - mkdir -p "$CCACHE_DIR" - - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_OPENSSL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLAMA_LLGUIDANCE=ON \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 - - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - ubuntu-cmake-rpc-riscv64-native: - runs-on: RISCV64 - - continue-on-error: true - - steps: - - name: Install dependencies - run: | - sudo apt-get update - - # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs - - # Set gcc-14 and g++-14 as the default compilers - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - - # Install Rust stable version - rustup install stable - rustup default stable - - git lfs install - - - name: GCC version check - run: | - gcc --version - 
g++ --version - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Setup ccache - run: | - export CCACHE_DIR="$HOME/.ccache/rpc-riscv64" - mkdir -p "$CCACHE_DIR" - - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=ON \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DGGML_RPC=ON - - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ggml-ci-arm64-graviton4-kleidiai: + ggml-ci-arm64-cpu-kleidiai-graviton4: runs-on: ah-ubuntu_22_04-c8g_8x steps: @@ -2258,7 +1252,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ggml-ci-arm64-graviton4-kleidiai + key: ggml-ci-arm64-cpu-kleidiai-graviton4 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 8d1dd7a7d5..e21b3b6568 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -4,10 +4,16 @@ on: push: branches: - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] + paths: [ + '.github/workflows/python-lint.yml', + '**/*.py' + ] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] + paths: [ + 
'.github/workflows/python-lint.yml', + '**/*.py' + ] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6d5d3774dd..b0f2714ffd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -10,7 +10,22 @@ on: push: branches: - master - paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/release.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp', + '**/*.glsl' + ] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -34,7 +49,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: macOS-latest-cmake-arm64 + key: macOS-latest-arm64 evict-old-files: 1d - name: Build @@ -81,7 +96,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: macOS-latest-cmake-x64 + key: macOS-latest-x64 evict-old-files: 1d - name: Build @@ -140,7 +155,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-cpu-cmake-${{ matrix.build }} + key: ubuntu-cpu-${{ matrix.build }} evict-old-files: 1d - name: Dependencies @@ -191,7 +206,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-vulkan + key: ubuntu-22-vulkan evict-old-files: 1d - name: Dependencies @@ -238,7 +253,7 @@ jobs: openvino_version: ${{ steps.openvino_version.outputs.value }} env: - # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile OPENVINO_VERSION_MAJOR: "2026.0" OPENVINO_VERSION_FULL: 
"2026.0.0.20965.c6d6a13a886" @@ -256,7 +271,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-24-cmake-openvino-release-no-preset-v1 + key: ubuntu-24-openvino-release-no-preset-v1 evict-old-files: 1d - name: Dependencies @@ -329,7 +344,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-cpu-${{ matrix.arch }} + key: windows-latest-cpu-${{ matrix.arch }} variant: ccache evict-old-files: 1d @@ -390,7 +405,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} + key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }} variant: ccache evict-old-files: 1d @@ -536,7 +551,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-sycl + key: windows-latest-sycl variant: ccache evict-old-files: 1d @@ -616,7 +631,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} + key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} evict-old-files: 1d - name: Dependencies @@ -726,7 +741,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64 + key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64 evict-old-files: 1d - name: Install ROCm @@ -952,7 +967,7 @@ jobs: permissions: contents: write # for creating release - runs-on: ubuntu-latest + runs-on: ubuntu-slim needs: - windows diff --git a/.github/workflows/server-sanitize.yml b/.github/workflows/server-sanitize.yml new file mode 100644 index 0000000000..4c9f447cf8 --- /dev/null +++ b/.github/workflows/server-sanitize.yml @@ -0,0 +1,105 @@ +name: Server (sanitize) + +on: + workflow_dispatch: # allows manual triggering + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + 
slow_tests: + description: 'Run slow tests' + required: true + type: boolean + push: + branches: + - master + paths: [ + '.github/workflows/server-sanitize.yml', + '**/CMakeLists.txt', + '**/Makefile', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + 'tools/server/**.*' + ] + +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + server: + runs-on: ubuntu-latest + + strategy: + matrix: + sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow + build_type: [RelWithDebInfo] + fail-fast: false + + steps: + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get -y install \ + build-essential \ + xxd \ + git \ + cmake \ + curl \ + wget \ + language-pack-en \ + libssl-dev + + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_BUILD_BORINGSSL=ON \ + -DGGML_SCHED_NO_REALLOC=ON \ + -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ + -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ + -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \ + -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ + -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ + -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + + - name: Python setup + id: setup_python + uses: actions/setup-python@v6 + with: + python-version: '3.11' + pip-install: -r tools/server/tests/requirements.txt + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) 
}} + run: | + cd tools/server/tests + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" + + - name: Slow tests + id: server_integration_tests_slow + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + run: | + cd tools/server/tests + export ${{ matrix.extra_args }} + SLOW_TESTS=1 pytest -v -x diff --git a/.github/workflows/server-metal.yml b/.github/workflows/server-self-hosted.yml similarity index 55% rename from .github/workflows/server-metal.yml rename to .github/workflows/server-self-hosted.yml index 1d707bef44..29bd79690a 100644 --- a/.github/workflows/server-metal.yml +++ b/.github/workflows/server-self-hosted.yml @@ -1,4 +1,4 @@ -name: Server-Metal +name: Server (self-hosted) on: workflow_dispatch: # allows manual triggering @@ -14,7 +14,19 @@ on: push: branches: - master - paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] + paths: [ + '.github/workflows/server-self-hosted.yml', + '**/CMakeLists.txt', + '**/Makefile', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.swift', + '**/*.m', + 'tools/server/**.*' + ] env: LLAMA_LOG_COLORS: 1 @@ -28,7 +40,7 @@ concurrency: jobs: server-metal: - runs-on: [self-hosted, macOS, ARM64] + runs-on: [self-hosted, llama-server, macOS, ARM64] name: server-metal (${{ matrix.wf_name }}) strategy: @@ -71,3 +83,42 @@ jobs: pip install -r requirements.txt export ${{ matrix.extra_args }} pytest -v -x -m "not slow" + + server-cuda: + runs-on: [self-hosted, llama-server, Linux, NVIDIA] + + name: server-cuda (${{ matrix.wf_name }}) + strategy: + matrix: + build_type: [Release] + wf_name: ["GPUx1"] + include: + - build_type: Release + extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx1, backend-sampling" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: 
+ fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_SCHED_NO_REALLOC=ON + cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index 94899c9376..492107ffd8 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -1,4 +1,3 @@ -# Server WebUI build and tests name: Server WebUI on: @@ -11,10 +10,20 @@ on: push: branches: - master - paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**'] + paths: [ + '.github/workflows/server-webui.yml', + 'tools/server/webui/**.*', + 'tools/server/tests/**.*', + 'tools/server/public/**' + ] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**'] + paths: [ + '.github/workflows/server-webui.yml', + 'tools/server/webui/**.*', + 'tools/server/tests/**.*', + 'tools/server/public/**' + ] env: LLAMA_LOG_COLORS: 1 @@ -29,7 +38,7 @@ concurrency: jobs: webui-check: name: WebUI Checks - runs-on: ubuntu-latest + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} continue-on-error: true steps: - name: Checkout code diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 99d05226ba..750c29f08e 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -1,4 +1,3 @@ -# Server build and tests 
name: Server on: @@ -15,10 +14,34 @@ on: push: branches: - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] + paths: [ + '.github/workflows/server.yml', + '**/CMakeLists.txt', + '**/Makefile', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.swift', + '**/*.m', + 'tools/server/**.*' + ] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] + paths: [ + '.github/workflows/server.yml', + '**/CMakeLists.txt', + '**/Makefile', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.swift', + '**/*.m', + 'tools/server/**.*' + ] env: LLAMA_LOG_COLORS: 1 @@ -34,17 +57,18 @@ jobs: server: runs-on: ubuntu-latest + name: server (${{ matrix.wf_name }}) strategy: matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow - build_type: [RelWithDebInfo] + build_type: [Release] + wf_name: ["default"] include: - build_type: Release - sanitizer: "" extra_args: "" + wf_name: "default" - build_type: Release - sanitizer: "" extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "backend-sampling" fail-fast: false steps: @@ -74,13 +98,7 @@ jobs: run: | cmake -B build \ -DLLAMA_BUILD_BORINGSSL=ON \ - -DGGML_SCHED_NO_REALLOC=ON \ - -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ - -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ - -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \ - -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ - -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ - -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} + -DGGML_SCHED_NO_REALLOC=ON cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - 
name: Python setup diff --git a/.gitignore b/.gitignore index bb122d6924..73954e8f5d 100644 --- a/.gitignore +++ b/.gitignore @@ -124,6 +124,11 @@ poetry.toml # Scripts !/scripts/install-oneapi.bat +# Generated by scripts +/hellaswag_val_full.txt +/winogrande-debiased-eval.csv +/wikitext-2-raw/ + # Test models for lora adapters /lora-tests diff --git a/CODEOWNERS b/CODEOWNERS index 29673a7b26..4257f5927a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,29 +2,13 @@ # multiplie collaborators per item can be specified /.devops/*.Dockerfile @ngxson -/.github/actions/ @CISC -/.github/workflows/ @CISC +/.github/actions/ @ggml-org/ci +/.github/workflows/ @ggml-org/ci /ci/ @ggerganov /cmake/ @ggerganov -/common/CMakeLists.txt @ggerganov -/common/arg.* @ggerganov -/common/base64.hpp.* @ggerganov -/common/build-info.* @ggerganov -/common/chat.* @pwilkin -/common/chat-auto*.* @pwilkin -/common/chat-diff-analyzer.* @pwilkin -/common/chat-peg-parser.* @aldehir -/common/common.* @ggerganov -/common/console.* @ggerganov -/common/http.* @angt -/common/jinja/ @ngxson @CISC @aldehir -/common/llguidance.* @ggerganov -/common/log.* @ggerganov +/common/ @ggml-org/llama-common +/common/jinja/ @CISC /common/ngram-map.* @srogmann -/common/peg-parser.* @aldehir -/common/sampling.* @ggerganov -/common/speculative.* @ggerganov -/common/unicode.* @aldehir /convert_*.py @CISC /examples/batched.swift/ @ggerganov /examples/batched/ @ggerganov @@ -51,29 +35,27 @@ /examples/speculative/ @ggerganov /ggml/cmake/ @ggerganov /ggml/include/ @ggerganov +/ggml/src/ggml-cann/ @ggml-org/ggml-cann /ggml/src/ggml-common.h @ggerganov /ggml/src/ggml-cpu/ @ggerganov /ggml/src/ggml-cpu/spacemit/ @alex-spacemit -/ggml/src/ggml-cuda/fattn* @JohannesGaessler -/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an -/ggml/src/ggml-cuda/mmq.* @JohannesGaessler -/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler -/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler +/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda /ggml/src/ggml-cuda/fattn-wmma* 
@IMbackK /ggml/src/ggml-hip/ @IMbackK /ggml/src/ggml-cuda/vendors/hip.h @IMbackK /ggml/src/ggml-impl.h @ggerganov -/ggml/src/ggml-metal/ @ggerganov -/ggml/src/ggml-opencl/ @lhez @max-krasnyansky -/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez +/ggml/src/ggml-metal/ @ggml-org/ggml-metal +/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl +/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon /ggml/src/ggml-opt.cpp @JohannesGaessler /ggml/src/ggml-quants.* @ggerganov -/ggml/src/ggml-rpc/ @rgerganov +/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc +/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl /ggml/src/ggml-threading.* @ggerganov -/ggml/src/ggml-vulkan/ @0cc4m +/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan /ggml/src/ggml-virtgpu/ @kpouget -/ggml/src/ggml-webgpu/ @reeselevine -/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM +/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu +/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM /ggml/src/ggml-openvino/ @cavusmustafa @wine99 /ggml/src/ggml.c @ggerganov /ggml/src/ggml.cpp @ggerganov @@ -93,16 +75,18 @@ /src/models/ @CISC /tests/ @ggerganov /tests/test-chat.* @pwilkin +/tests/test-llama-archs.cpp @JohannesGaessler /tools/batched-bench/ @ggerganov /tools/cli/ @ngxson /tools/completion/ @ggerganov -/tools/mtmd/ @ngxson +/tools/mtmd/ @ggml-org/llama-mtmd /tools/perplexity/ @ggerganov /tools/parser/ @pwilkin /tools/quantize/ @ggerganov -/tools/rpc/ @rgerganov -/tools/server/* @ngxson @ggerganov # no subdir -/tools/server/webui/ @allozaur +/tools/rpc/ @ggml-org/ggml-rpc +/tools/server/* @ggml-org/llama-server # no subdir +/tools/server/tests/ @ggml-org/llama-server +/tools/server/webui/ @ggml-org/llama-webui /tools/tokenize/ @ggerganov /tools/tts/ @ggerganov /vendor/ @ggerganov diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 4068340a5c..05b3b6b6a8 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -479,6 +479,7 @@ 
analyze_content::analyze_content(const common_chat_template & tmpl, const analyz if (!comparison_with_tools || !comparison_with_reasoning) { LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__); + return; } const auto & diff_tools = comparison_with_tools->diff; @@ -911,8 +912,10 @@ void analyze_tools::extract_function_markers() { // we'll have to rely on an extra diff with no-calls version auto notool_comp = compare_variants( *tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_nocall }); }); - auto nt_diff = notool_comp->diff; - closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4); + if (notool_comp) { + auto nt_diff = notool_comp->diff; + closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4); + } } else { closer_suffix = diff.suffix.substr(0, diff.suffix.find(suffix_marker)); } diff --git a/common/regex-partial.cpp b/common/regex-partial.cpp index e667a209e9..bd9034e931 100644 --- a/common/regex-partial.cpp +++ b/common/regex-partial.cpp @@ -102,7 +102,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) { auto is_star = *it == '*'; ++it; if (is_star) { - if (*it == '?') { + if (it != end && *it == '?') { ++it; } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index eec0ea14e3..b4ff8dd959 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -272,8 +272,9 @@ class ModelBase: return tensors def dequant_model(self): - if self._is_nvfp4: - return # NVFP4 weights are repacked in _generate_nvfp4_tensors + # If all quantized tensors were already handled (e.g. 
pure NVFP4), skip + if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors): + return tensors_to_remove: list[str] = [] new_tensors: dict[str, Callable[[], Tensor]] = {} @@ -474,7 +475,20 @@ class ModelBase: tensors_to_remove.append(base_name + "_zero_point") else: raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") - else: + elif quant_method == "modelopt": + # Mixed-precision ModelOpt models: NVFP4 tensors are handled by + # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and + # are dequantized here. input_scale tensors are unused. + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) + tensors_to_remove.append(name) + if name.endswith((".input_scale", ".k_scale", ".v_scale")): + tensors_to_remove.append(name) + elif quant_method is not None: raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") for name in tensors_to_remove: @@ -520,12 +534,6 @@ class ModelBase: raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors) - if self._is_nvfp4: - if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")): - return [] - if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors: - return [] new_name = self.map_tensor_name(name) @@ -609,6 +617,7 @@ class ModelBase: expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} expert_shapes: dict[tuple[int, str], list[int]] = {} n_experts = self.find_hparam(["num_local_experts", 
"num_experts"], optional=True) or 0 + consumed: list[str] = [] for name in list(self.model_tensors.keys()): if not name.endswith(".weight"): @@ -620,8 +629,18 @@ class ModelBase: # Force eager materialization of lazy tensors weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]()) + + # Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales) + if scale.ndim < 2: + continue + scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) + # Mark tensors for removal from model_tensors (already written to gguf) + consumed.extend([name, scale_name]) + if scale2_name in self.model_tensors: + consumed.append(scale2_name) + # Check if this is a per-expert tensor m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) if m: @@ -652,6 +671,15 @@ class ModelBase: for (bid, proj_type) in list(expert_blocks.keys()): self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type) + # Remove consumed tensors so get_tensors/modify_tensors won't see them + for name in consumed: + self.model_tensors.pop(name, None) + + # Remove unused auxiliary tensors (input_scale, k_scale, v_scale) + for name in list(self.model_tensors.keys()): + if name.endswith((".input_scale", ".k_scale", ".v_scale")): + del self.model_tensors[name] + def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type): experts = expert_blocks.pop(key) scales = expert_scales.pop(key) @@ -677,20 +705,31 @@ class ModelBase: def prepare_tensors(self): # detect NVFP4 quantization (ModelOpt format) quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_config_file = self.dir_model / "hf_quant_config.json" - if not quant_algo and quant_config_file.is_file(): + if (not quant_algo or not 
quant_layers) and quant_config_file.is_file(): with open(quant_config_file, "r", encoding="utf-8") as f: - quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo") + quant_config = json.load(f).get("quantization") or {} + quant_algo = quant_config.get("quant_algo", quant_algo) + quant_layers = quant_config.get("quantized_layers", quant_layers) or {} + + # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with + # per-layer NVFP4/FP8) instead of a single global "NVFP4" value. + if quant_algo != "NVFP4": + if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)): + quant_algo = "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4" - self.dequant_model() - - # NVFP4 weights are repacked and written directly to gguf_writer + # NVFP4 weights are repacked and written directly to gguf_writer. + # This must run before dequant_model so NVFP4 tensors are removed + # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant. if self._is_nvfp4: self._generate_nvfp4_tensors() + self.dequant_model() + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) if self.tensor_map.mapping: max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index b0adde8a8b..871ce82422 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -128,6 +128,12 @@ class LoraTorchTensor: assert dim is None return self.shape + def contiguous(self) -> LoraTorchTensor: + return LoraTorchTensor( + self._lora_A.contiguous(), + self._lora_B.contiguous(), + ) + def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: if isinstance(shape[0], tuple): new_shape: tuple[int, ...] 
= shape[0] diff --git a/docs/ops.md b/docs/ops.md index f914c2b7d2..00bdcd5d2b 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -15,7 +15,7 @@ Legend: | Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN | |-----------|------|------|------|------|------|------|------|------|------|------|------| | ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | @@ -47,7 +47,7 @@ Legend: | FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 03bfacfc9e..13ac447cc1 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -6841,10 +6841,6 @@ "SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=193,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" -"SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[128,1024],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" -"SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=3,k=4,bs=[128,1024],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" 
-"SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[131072,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" -"SYCL0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[131072,1],nr=[1,1],per=[0,1,2,3],k_v=64,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" @@ -10261,8 +10257,8 @@ "SYCL0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1],stride_dim=-1","support","1","yes","SYCL" "SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[256,16,2,3],stride_dim=-1","support","1","yes","SYCL" "SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[128,16,2,3],stride_dim=-1","support","1","yes","SYCL" -"SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[256,16,2,3],stride_dim=1","support","1","yes","SYCL" -"SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[128,16,2,3],stride_dim=2","support","1","yes","SYCL" +"SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[256,16,2,3],stride_dim=1","support","0","no","SYCL" +"SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[128,16,2,3],stride_dim=2","support","0","no","SYCL" "SYCL0","ACC","type=f32,ne_a=[256,17,2,3],ne_b=[64,16,2,3],stride_dim=3","support","1","yes","SYCL" "SYCL0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","SYCL" "SYCL0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","SYCL" @@ -13591,16 +13587,21 @@ "SYCL0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","SYCL" "SYCL0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" "SYCL0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" 
-"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" 
+"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" 
+"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index b6a7460da8..e9abdf288c 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -892,7 +892,7 @@ void launch_fattn( const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1); const int gqa_ratio = Q->ne[2] / K->ne[2]; const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2); - const int ntiles_total = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3]; + const int ntiles_dst = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3]; // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped. // Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or @@ -919,37 +919,37 @@ void launch_fattn( GGML_ASSERT(max_blocks_per_sm > 0); int parallel_blocks = max_blocks_per_sm; + const int ntiles_KV = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by KV cache length. + dim3 blocks_num; if (stream_k) { // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup. 
const int max_blocks = max_blocks_per_sm*nsm; - const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks; - const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves); + const int tiles_nwaves = (ntiles_dst + max_blocks - 1) / max_blocks; + const int tiles_efficiency_percent = 100 * ntiles_dst / (max_blocks*tiles_nwaves); - const int nblocks_stream_k = max_blocks; + const int nblocks_stream_k = std::min(max_blocks, ntiles_KV*ntiles_dst); const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75; - blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total; + blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_dst; blocks_num.y = 1; blocks_num.z = 1; - if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. + if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2))); } } else { - const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size. - // parallel_blocks must not be larger than what the tensor size allows: - parallel_blocks = std::min(parallel_blocks, ntiles_KQ); + parallel_blocks = std::min(parallel_blocks, ntiles_KV); // If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects. // Test whether parallel_blocks can be set to a higher value for better efficiency. 
const int blocks_per_wave = nsm * max_blocks_per_sm; int nwaves_best = 0; int efficiency_percent_best = 0; - for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) { - const int nblocks_total = ntiles_total * parallel_blocks_test; + for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KV; ++parallel_blocks_test) { + const int nblocks_total = ntiles_dst * parallel_blocks_test; const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave; const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave); @@ -1015,7 +1015,7 @@ void launch_fattn( CUDA_CHECK(cudaGetLastError()); if (stream_k) { - if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. + if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2}; diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index 1ce6d5f31b..6b44bec731 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -1,7 +1,8 @@ #include "gated_delta_net.cuh" template -__global__ void gated_delta_net_cuda(const float * q, +__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2) +gated_delta_net_cuda(const float * q, const float * k, const float * v, const float * g, @@ -38,7 +39,7 @@ __global__ void gated_delta_net_cuda(const float * q, const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v; state += state_offset; - curr_state += state_offset; + curr_state += state_offset + col * S_v; attn_data += (sequence * n_tokens * H + h_idx) * S_v; constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? 
ggml_cuda_get_physical_warp_size() : S_v; @@ -46,10 +47,11 @@ __global__ void gated_delta_net_cuda(const float * q, constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size; float s_shard[rows_per_lane]; // state is stored transposed: M[col][i] = S[i][col], row col is contiguous + #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; - s_shard[r] = curr_state[col * S_v + i]; + s_shard[r] = curr_state[i]; } for (int t = 0; t < n_tokens; t++) { @@ -63,6 +65,16 @@ __global__ void gated_delta_net_cuda(const float * q, const float beta_val = *beta_t; + // Cache k and q in registers + float k_reg[rows_per_lane]; + float q_reg[rows_per_lane]; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + k_reg[r] = k_t[i]; + q_reg[r] = q_t[i]; + } + if constexpr (!KDA) { const float g_val = expf(*g_t); @@ -70,8 +82,7 @@ __global__ void gated_delta_net_cuda(const float * q, float kv_shard = 0.0f; #pragma unroll for (int r = 0; r < rows_per_lane; r++) { - const int i = r * warp_size + lane; - kv_shard += s_shard[r] * k_t[i]; + kv_shard += s_shard[r] * k_reg[r]; } float kv_col = warp_reduce_sum(kv_shard); @@ -83,9 +94,8 @@ __global__ void gated_delta_net_cuda(const float * q, float attn_partial = 0.0f; #pragma unroll for (int r = 0; r < rows_per_lane; r++) { - const int i = r * warp_size + lane; - s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col; - attn_partial += s_shard[r] * q_t[i]; + s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col; + attn_partial += s_shard[r] * q_reg[r]; } float attn_col = warp_reduce_sum(attn_partial); @@ -99,7 +109,7 @@ __global__ void gated_delta_net_cuda(const float * q, #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; - kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i]; + kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r]; } float kv_col = warp_reduce_sum(kv_shard); @@ -113,8 +123,8 @@ __global__ void 
gated_delta_net_cuda(const float * q, #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; - s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col; - attn_partial += s_shard[r] * q_t[i]; + s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col; + attn_partial += s_shard[r] * q_reg[r]; } float attn_col = warp_reduce_sum(attn_partial); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2d7660565e..8188e2acec 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -124,7 +124,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) err = cudaMallocManaged(ptr, size); #if defined(GGML_USE_HIP) if (err == hipSuccess) { - CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); + // hipMemAdviseSetCoarseGrain is an optional performance hint; + // ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs). + cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device); + (void)hipGetLastError(); // clear any error } // fall back to cudaMalloc if not supported (e.g. on Windows) @@ -251,11 +254,6 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].supports_cooperative_launch = false; #endif // !(GGML_USE_MUSA) - // cudaMemGetInfo returns info for the current device - size_t free_mem; - CUDA_CHECK(cudaSetDevice(id)); - CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL)); - #if defined(GGML_USE_HIP) info.devices[id].smpbo = prop.sharedMemPerBlock; @@ -270,25 +268,25 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].cc += prop.minor * 0x10; } } - GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n", + GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB\n", id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? 
"yes" : "no", prop.warpSize, - (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024)); + (size_t)(prop.totalGlobalMem / (1024 * 1024))); #elif defined(GGML_USE_MUSA) // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs. info.devices[id].warp_size = 32; info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100; info.devices[id].cc += prop.minor * 0x10; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n", + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", - (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024)); + (size_t)(prop.totalGlobalMem / (1024 * 1024))); #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; - GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n", + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", - (size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024)); + (size_t)(prop.totalGlobalMem / (1024 * 1024))); std::string device_name(prop.name); if (device_name == "NVIDIA GeForce MX450") { turing_devices_without_mma.push_back({ id, device_name }); @@ -303,6 +301,7 @@ static ggml_cuda_device_info ggml_cuda_init() { // TODO: Check for future drivers the default scheduling strategy and // remove this call again when cudaDeviceScheduleSpin is default. 
if (prop.major == 12 && prop.minor == 1) { + CUDA_CHECK(cudaSetDevice(id)); CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin)); } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index ce25ccf427..632246e43f 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -60,11 +60,17 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { enum mmvq_parameter_table_id { MMVQ_PARAMETERS_GENERIC = 0, MMVQ_PARAMETERS_GCN, - MMVQ_PARAMETERS_RDNA2 + MMVQ_PARAMETERS_RDNA2, + MMVQ_PARAMETERS_RDNA3_0, + MMVQ_PARAMETERS_RDNA4 }; static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { -#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4) +#if defined(RDNA4) + return MMVQ_PARAMETERS_RDNA4; +#elif defined(RDNA3_0) + return MMVQ_PARAMETERS_RDNA3_0; +#elif defined(RDNA2) || defined(RDNA3_5) return MMVQ_PARAMETERS_RDNA2; #elif defined(GCN) || defined(CDNA) return MMVQ_PARAMETERS_GCN; @@ -74,7 +80,13 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { } static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { - if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (GGML_CUDA_CC_IS_RDNA4(cc)) { + return MMVQ_PARAMETERS_RDNA4; + } + if (GGML_CUDA_CC_IS_RDNA3_0(cc)) { + return MMVQ_PARAMETERS_RDNA3_0; + } + if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { return MMVQ_PARAMETERS_RDNA2; } if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { @@ -83,7 +95,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { return MMVQ_PARAMETERS_GENERIC; } -static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) { +static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC) { switch (ncols_dst) { case 1: @@ -114,6 +126,50 @@ static constexpr __host__ __device__ int 
calc_nwarps(int ncols_dst, mmvq_paramet return 1; } } + if (table_id == MMVQ_PARAMETERS_RDNA4) { + // nwarps=8 benefits types with simple vec_dot on RDNA4 (ncols_dst=1). + // Types with complex vec_dot (Q3_K, IQ2_*, IQ3_*) regress due to register + // pressure and lookup table contention at higher thread counts. + if (ncols_dst == 1) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return 8; + default: + return 1; + } + } + return 1; + } + if (table_id == MMVQ_PARAMETERS_RDNA3_0) { + // RDNA3 (W7900): stricter whitelist than RDNA4. + // Q2_K / Q5_K / IQ4_XS regress in full quant sweeps. + if (ncols_dst == 1) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_NL: + return 8; + default: + return 1; + } + } + return 1; + } return 1; } @@ -138,7 +194,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int } template -__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) +__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, @@ -151,7 +207,7 @@ static __global__ void mul_mat_vec_q( constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); - constexpr int nwarps = 
calc_nwarps(ncols_dst, table_id); + constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id); constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); @@ -355,12 +411,13 @@ static __global__ void mul_mat_vec_q( } } +template static std::pair calc_launch_params( const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens, const int warp_size, const mmvq_parameter_table_id table_id) { const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens); - const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); + const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1); return {block_nums, block_dims}; } @@ -420,7 +477,7 @@ static void mul_mat_vec_q_switch_ncols_dst( if (has_ids && ncols_dst > 1) { // Multi-token MUL_MAT_ID path only - single-token goes through regular path below constexpr int c_ncols_dst = 1; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -431,7 +488,7 @@ static void mul_mat_vec_q_switch_ncols_dst( switch (ncols_dst) { case 1: { constexpr int c_ncols_dst = 1; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, 
nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -439,7 +496,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 2: { constexpr int c_ncols_dst = 2; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -447,7 +504,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 3: { constexpr int c_ncols_dst = 3; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -455,7 +512,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 4: { constexpr int c_ncols_dst = 4; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, 
stride_sample_dst, @@ -463,7 +520,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 5: { constexpr int c_ncols_dst = 5; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -471,7 +528,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 6: { constexpr int c_ncols_dst = 6; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -479,7 +536,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 7: { constexpr int c_ncols_dst = 7; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, @@ -487,7 +544,7 @@ static void mul_mat_vec_q_switch_ncols_dst( } break; case 8: { constexpr int c_ncols_dst = 8; - std::pair dims = calc_launch_params(c_ncols_dst, 
nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 5cc1b54319..35d1e1a063 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -207,6 +207,14 @@ #define RDNA3 #endif // defined(__GFX11__) +#if defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3_5 +#endif // defined(__gfx1150__) || defined(__gfx1151__) + +#if defined(RDNA3) && !defined(RDNA3_5) +#define RDNA3_0 +#endif // defined(RDNA3) && !defined(RDNA3_5) + #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) #define RDNA2 diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index d6e9776b87..19917cb114 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -402,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { static const int qk = QK_Q4_0x4x2; const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers const int dblk_size = 8 * 2; // 8x __fp16 const int qblk_size = qk / 2; // int4 @@ -435,9 +436,11 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { unpack_q4_0_quants(qs, &x[i * 8 + 6], 6); unpack_q4_0_quants(qs, &x[i * 8 + 7], 7); + bool partial = (nloe && i == nb-1); + uint8_t * q = y_q + (i 
* qblk_size); for (int j = 0; j < qk / 2; j++) { - q[j] = (qs[j + 128] << 4) | qs[j]; + q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; } } @@ -467,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) { static const int qk = QK_Q4_0x4x2; const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers const int dblk_size = 8 * 2; // 8x __fp16 const int qblk_size = qk / 2; // int4 @@ -485,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) { for (int i = 0; i < nb; i++) { uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + bool partial = (nloe && i == nb-1); + const uint8_t * q = y_q + (i * qblk_size); for (int j = 0; j < qk / 2; j++) { - qs[j] = q[j] & 0xf; - qs[j + 128] = q[j] >> 4; + if (partial) { + qs[j*2+0] = q[j] & 0xf; + qs[j*2+1] = q[j] >> 4; + } else { + qs[j+000] = q[j] & 0xf; + qs[j+128] = q[j] >> 4; + } } pack_q4_0_quants(&x[i * 8 + 0], qs, 0); @@ -1078,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) { static const int qk = QK_MXFP4x4x2; const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers const int eblk_size = 8 * 1; // 8x E8M0 const int qblk_size = qk / 2; // int4 @@ -1112,9 +1124,11 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6); unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7); + bool partial = (nloe && i == nb-1); + uint8_t * q = y_q + (i * qblk_size); for (int j = 0; j < qk / 2; j++) { - q[j] = (qs[j + 128] << 4) | qs[j]; + q[j] = partial ? 
(qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; } } @@ -1144,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) { static const int qk = QK_MXFP4x4x2; const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers const int eblk_size = 8 * 1; // 8x E8M0 const int qblk_size = qk / 2; // int4 @@ -1162,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) for (int i = 0; i < nb; i++) { uint8_t qs[QK_MXFP4x4x2]; // unpacked quants + bool partial = (nloe && i == nb-1); + const uint8_t * q = y_q + (i * qblk_size); for (int j = 0; j < qk / 2; j++) { - qs[j] = q[j] & 0xf; - qs[j + 128] = q[j] >> 4; + if (partial) { + qs[j*2+0] = q[j] & 0xf; + qs[j*2+1] = q[j] >> 4; + } else { + qs[j+000] = q[j] & 0xf; + qs[j+128] = q[j] >> 4; + } } pack_mxfp4_quants(&x[i * 8 + 0], qs, 0); @@ -1801,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s return false; } - if (src0->ne[1] > 16 * 1024) { + if (ggml_nrows(src0) > 16 * 1024) { return false; // typically the lm-head which would be too large for VTCM } - if ((src1->ne[2] != 1 || src1->ne[3] != 1)) { - return false; + if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) { + return false; // no huge batches or broadcasting (for now) } // src0 (weights) must be repacked @@ -1820,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n"); return false; } + if (ggml_nrows(src1) > 1024) { + return false; // no huge batches (for now) + } break; default: diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 9ca74aedfe..73aaba79eb 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ 
b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -77,7 +77,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) { return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128); } -static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { +static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) { const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) @@ -88,9 +88,9 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); const HVX_Vector i8 = Q6_Vb_vsplat_R(8); - HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 - HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F + HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements + HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ... 
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 @@ -111,7 +111,41 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { return r; } -static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) { +static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + const uint32_t qk = QK_Q4_0x4x2; // 256 + const uint32_t nb = n / qk; + const uint32_t nloe = n % qk; + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + HVX_Vector_x8 r; + uint32_t i = 0; + + #pragma unroll(2) + for (i=0; i < nb; i++) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements + r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8); + r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8); + } + + if (nloe) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements + HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... 
+ r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8); + r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8); + } + + return r; +} + +static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) { const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) @@ -144,7 +178,41 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) return r; } -static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) { +static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + const uint32_t qk = QK_Q4_0x4x2; // 256 + const uint32_t nb = n / qk; + const uint32_t nloe = n % qk; + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + + HVX_Vector_x8 r; + uint32_t i = 0; + + #pragma unroll(2) + for (i=0; i < nb; i++) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements + r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0); + r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0); + } + + if (nloe) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements + HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... 
+ r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0); + r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0); + } + + return r; +} + +static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) { const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; HVX_Vector v0 = vptr[0]; // first 128 vals @@ -160,6 +228,10 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) { return r; } +static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) { + return hvx_vec_load_q8x4x8_full(ptr); +} + // Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors). // Accumulate each block into a single int32 value. // Return a single HVX vector with 32x int32 accumulators. @@ -167,14 +239,14 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) { // if() checks are optimized out at compile time -- make sure to pass N as a constexpr. static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { - HVX_Vector r0 = Q6_V_vsplat_R(0); - HVX_Vector r1 = Q6_V_vsplat_R(0); - HVX_Vector r2 = Q6_V_vsplat_R(0); - HVX_Vector r3 = Q6_V_vsplat_R(0); - HVX_Vector r4 = Q6_V_vsplat_R(0); - HVX_Vector r5 = Q6_V_vsplat_R(0); - HVX_Vector r6 = Q6_V_vsplat_R(0); - HVX_Vector r7 = Q6_V_vsplat_R(0); + HVX_Vector r0 = Q6_V_vzero(); + HVX_Vector r1 = Q6_V_vzero(); + HVX_Vector r2 = Q6_V_vzero(); + HVX_Vector r3 = Q6_V_vzero(); + HVX_Vector r4 = Q6_V_vzero(); + HVX_Vector r5 = Q6_V_vzero(); + HVX_Vector r6 = Q6_V_vzero(); + HVX_Vector r7 = Q6_V_vzero(); HVX_VectorPair p3; HVX_VectorPair p2; @@ -213,15 +285,42 @@ static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, uns } static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) { - return hvx_vec_rmpy_x8_n(x, y, 1024); + HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); + HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); + 
HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); + HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); + HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); + HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); + HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); + HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); + + HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4); + HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4); + HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4); + HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4); + + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); + r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); + r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); + r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); + + p0 = Q6_W_vdeal_VVR(r1, r0, -4); + p1 = Q6_W_vdeal_VVR(r3, r2, -4); + + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); + r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); + + p0 = Q6_W_vdeal_VVR(r1, r0, -4); + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); + + return r0; } -// Handle most common cases of tensors not multiple of 1024. 
-static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { - if (n <= 256) { return hvx_vec_rmpy_x8_n(x, y, 256); }; - if (n <= 512) { return hvx_vec_rmpy_x8_n(x, y, 512); }; - if (n <= 768) { return hvx_vec_rmpy_x8_n(x, y, 768); }; - return hvx_vec_rmpy_x8_n(x, y, 1024); +static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { + if (n >= 512) + return hvx_vec_rmpy_x8_full(x, y); + + return hvx_vec_rmpy_x8_partial(x, y, 512); } static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { @@ -246,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). @@ -257,12 +356,12 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); @@ -272,19 +371,19 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, 
r0_sum)); } - // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - // Zero out unused scales + // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r0_ia = Q6_V_vand_QV(bmask, r0_ia); @@ -326,8 +425,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). 
@@ -338,14 +437,14 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -359,23 +458,23 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); + HVX_Vector r0_ia = 
Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - // Zero out unused scales + // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); @@ -423,10 +522,10 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_c0_sum = Q6_V_vzero(); + HVX_Vector r0_c1_sum = Q6_V_vzero(); + HVX_Vector r1_c0_sum = Q6_V_vzero(); + HVX_Vector r1_c1_sum = Q6_V_vzero(); const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elements @@ -434,12 +533,12 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * uint32_t i = 0; for (; i < nb; i++) { // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); // Load src0 rows 
(reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); @@ -448,8 +547,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); // Load scales - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -473,18 +572,18 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * // Process leftovers if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe)); - 
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe)); + HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); + HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); + HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); + HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -545,7 +644,7 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). 
@@ -556,12 +655,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); @@ -571,19 +670,19 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } - // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - // Zero out unused scales + // Zero out unused elements 
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r0_ia = Q6_V_vand_QV(bmask, r0_ia); @@ -625,8 +724,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (qf32) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). @@ -637,14 +736,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -658,14 +757,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * 
x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -674,7 +773,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - // Zero out unused scales + // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); @@ -722,10 +821,10 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_c0_sum = Q6_V_vzero(); + HVX_Vector r0_c1_sum = Q6_V_vzero(); + HVX_Vector r1_c0_sum = Q6_V_vzero(); + HVX_Vector r1_c1_sum = Q6_V_vzero(); const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elements @@ -733,12 +832,12 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int 
n, float * restrict s0, float * uint32_t i = 0; for (; i < nb; i++) { // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); @@ -747,8 +846,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); // Load scales - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -772,18 +871,18 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * // Process leftovers if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * 
x_qblk_size); + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe)); + HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); + HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); + HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); + HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); @@ -792,7 +891,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - // Zero out unused scales + // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); r0_c1_dd = Q6_V_vand_QV(bmask, 
r0_c1_dd); @@ -844,7 +943,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). @@ -855,8 +954,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); @@ -887,12 +986,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving @@ -954,8 +1053,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vsplat_R(0); - 
HVX_Vector r1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). @@ -966,9 +1065,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, uint32_t i = 0; for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); @@ -1007,14 +1106,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, // Process leftovers if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); @@ -1087,10 +1186,10 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, 
float * restrict s0, float const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0); + HVX_Vector r0_c0_sum = Q6_V_vzero(); + HVX_Vector r0_c1_sum = Q6_V_vzero(); + HVX_Vector r1_c0_sum = Q6_V_vzero(); + HVX_Vector r1_c1_sum = Q6_V_vzero(); const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elements @@ -1098,12 +1197,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float uint32_t i = 0; for (; i < nb; i++) { // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); @@ -1157,15 +1256,15 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float // Process leftovers if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 vy0_q = 
hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe)); + HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); + HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); + HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); + HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size); HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size); @@ -1234,7 +1333,7 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_VectorPair rsum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair rsum_p = Q6_W_vzero(); uint32_t i = 0; @@ -1264,8 +1363,8 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0, uint32_t nvec = n / VLEN_FP16; uint32_t nloe = n % VLEN_FP16; - HVX_VectorPair rsum0_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); - HVX_VectorPair rsum1_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair rsum0_p = Q6_W_vzero(); + HVX_VectorPair rsum1_p = Q6_W_vzero(); uint32_t i = 0; @@ -1303,10 +1402,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res 
uint32_t nloe = n % VLEN_FP16; // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_VectorPair r0_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); - HVX_VectorPair r0_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); - HVX_VectorPair r1_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); - HVX_VectorPair r1_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair r0_c0_sum_p = Q6_W_vzero(); + HVX_VectorPair r0_c1_sum_p = Q6_W_vzero(); + HVX_VectorPair r1_c0_sum_p = Q6_W_vzero(); + HVX_VectorPair r1_c1_sum_p = Q6_W_vzero(); uint32_t i = 0; @@ -1358,7 +1457,7 @@ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void * uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector rsum = Q6_V_vsplat_R(0); + HVX_Vector rsum = Q6_V_vzero(); uint32_t i = 0; @@ -1388,9 +1487,9 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - const HVX_Vector zero = Q6_V_vsplat_R(0); + const HVX_Vector zero = Q6_V_vzero(); - HVX_Vector rsum = Q6_V_vsplat_R(0); + HVX_Vector rsum = Q6_V_vzero(); uint32_t i = 0; @@ -1973,7 +2072,7 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric assert((unsigned long) y_q % 128 == 0); HVX_Vector * vx = (HVX_Vector *) x; - HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector zero = Q6_V_vzero(); // Use reduce max fp32 to find max(abs(e)) first HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); @@ -2034,7 +2133,7 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric HVX_Vector * vx = (HVX_Vector *) x; // Load and convert into QF32 - HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector zero = Q6_V_vzero(); HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements HVX_Vector 
vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements @@ -2077,7 +2176,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric HVX_Vector * vx = (HVX_Vector *) x; // Load and convert into QF32 - HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector zero = Q6_V_vzero(); HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index b7d587f3bd..82101f4714 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1142,6 +1142,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te op->src[0]->ne[0] != 128 && op->src[0]->ne[0] != 192 && op->src[0]->ne[0] != 256 && + op->src[0]->ne[0] != 320 && op->src[0]->ne[0] != 576) { return false; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index d4b129ed75..b2328605dd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -6176,6 +6176,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] 
kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -6190,6 +6191,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #if defined(GGML_METAL_HAS_BF16) @@ -6205,6 +6207,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #endif @@ -6220,6 +6223,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template 
[[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -6234,6 +6238,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -6248,6 +6253,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -6262,6 +6268,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template 
[[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -6276,6 +6283,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #undef FA_TYPES @@ -6846,6 +6854,17 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flas template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif 
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #if defined(GGML_METAL_HAS_BF16) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index cdaded865b..48695a61ea 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4767,7 +4767,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block sumqx += w*q*xb[j]; sumq2 += w*q*q; } - d = sumqx/sumq2; + d = sumq2 > 0 ? sumqx/sumq2 : 0.f; float best = d*sumqx; for (int itry = -ntry; itry <= ntry; ++itry) { id = (itry + values[0])/max; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 9f0efb6535..fcb0db99c6 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -211,7 +211,7 @@ struct sycl_device_info { // number of compute units on a SYCL device. // size_t smpb; // max. shared memory per block size_t smpbo; // max. shared memory per block (with opt-in) - int warp_size; // max sub_group_size of SYCL + int warp_size; // WARP_SIZE(16)|WARP_32_SIZE(32)|WARP_16_SIZE(16). For Intel GPU, 16 is better in most cases. Some OP support 32 only. 
int max_wg_per_cu; // max work groups per compute unit - refer to // cudaOccupancyMaxActiveBlocksPerMultiprocessor bool vmm; // virtual memory support diff --git a/ggml/src/ggml-sycl/gated_delta_net.cpp b/ggml/src/ggml-sycl/gated_delta_net.cpp new file mode 100644 index 0000000000..648455c134 --- /dev/null +++ b/ggml/src/ggml-sycl/gated_delta_net.cpp @@ -0,0 +1,309 @@ +#include +#include "dpct/helper.hpp" +#include "common.hpp" +#include "ggml.h" +#include "gated_delta_net.hpp" +#include + + +template +void gated_delta_net_sycl(const float * q, + const float * k, + const float * v, + const float * g, + const float * beta, + const float * curr_state, + float * dst, + int64_t H, + int64_t n_tokens, + int64_t n_seqs, + int64_t sq1, + int64_t sq2, + int64_t sq3, + int64_t sv1, + int64_t sv2, + int64_t sv3, + int64_t sb1, + int64_t sb2, + int64_t sb3, + const sycl::uint3 neqk1_magic, + const sycl::uint3 rq3_magic, + float scale) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const uint32_t h_idx = item_ct1.get_group(2); + const uint32_t sequence = item_ct1.get_group(1); + // each warp owns one column, using warp-level primitives to reduce across rows + const int lane = item_ct1.get_local_id(2); + const int col = item_ct1.get_group(0) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); + + const uint32_t iq1 = fastmodulo(h_idx, neqk1_magic); + const uint32_t iq3 = fastdiv(sequence, rq3_magic); + + const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs; + float * attn_data = dst; + float * state = dst + attn_score_elems; + + const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v; + state += state_offset; + curr_state += state_offset; + attn_data += (sequence * n_tokens * H + h_idx) * S_v; + + constexpr int warp_size = ggml_sycl_get_physical_warp_size() < S_v ? 
ggml_sycl_get_physical_warp_size() : S_v; + static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size"); + constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size; + float s_shard[rows_per_lane]; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + s_shard[r] = curr_state[col * S_v + i]; + } + + for (int t = 0; t < n_tokens; t++) { + const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1; + const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1; + const float * v_t = v + sequence * sv3 + t * sv2 + h_idx * sv1; + + const int64_t gb_offset = sequence * sb3 + t * sb2 + h_idx * sb1; + const float * beta_t = beta + gb_offset; + const float * g_t = g + gb_offset * (KDA ? S_v : 1); + + const float beta_val = *beta_t; + + if constexpr (!KDA) { + const float g_val = sycl::native::exp(*g_t); + + // kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i] + float kv_shard = 0.0f; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + kv_shard += s_shard[r] * k_t[i]; + } + float kv_col = warp_reduce_sum(kv_shard); + + // delta[col] = (v[col] - g * kv[col]) * beta + float delta_col = (v_t[col] - g_val * kv_col) * beta_val; + + // fused: S[i][col] = g * S[i][col] + k[i] * delta[col] + // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i] + float attn_partial = 0.0f; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col; + attn_partial += s_shard[r] * q_t[i]; + } + + float attn_col = warp_reduce_sum(attn_partial); + + if (lane == 0) { + attn_data[col] = attn_col * scale; + } + } else { + // kv[col] = sum_i g[i] * S[i][col] * k[i] + float kv_shard = 0.0f; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + kv_shard += sycl::native::exp(g_t[i]) * s_shard[r] * k_t[i]; + } + + float kv_col = warp_reduce_sum(kv_shard); + + // 
delta[col] = (v[col] - kv[col]) * beta + float delta_col = (v_t[col] - kv_col) * beta_val; + + // fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col] + // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i] + float attn_partial = 0.0f; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + s_shard[r] = sycl::native::exp(g_t[i]) * s_shard[r] + k_t[i] * delta_col; + attn_partial += s_shard[r] * q_t[i]; + } + + float attn_col = warp_reduce_sum(attn_partial); + + if (lane == 0) { + attn_data[col] = attn_col * scale; + } + } + + attn_data += S_v * H; + } + + // Write state back to global memory +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + state[col * S_v + i] = s_shard[r]; + } +} + +template +static void launch_gated_delta_net(const float * q_d, + const float * k_d, + const float * v_d, + const float * g_d, + const float * b_d, + const float * s_d, + float * dst_d, + int64_t S_v, + int64_t H, + int64_t n_tokens, + int64_t n_seqs, + int64_t sq1, + int64_t sq2, + int64_t sq3, + int64_t sv1, + int64_t sv2, + int64_t sv3, + int64_t sb1, + int64_t sb2, + int64_t sb3, + int64_t neqk1, + int64_t rq3, + float scale, + dpct::queue_ptr stream) { + //TODO: Add chunked kernel for even faster pre-fill + const int warp_size = ggml_sycl_info().devices[ggml_sycl_get_device()].warp_size; + + const int num_warps = 4; + dpct::dim3 grid_dims(H, n_seqs, (S_v + num_warps - 1) / num_warps); + dpct::dim3 block_dims(warp_size <= S_v ? 
warp_size : S_v, num_warps, 1); + + const sycl::uint3 neqk1_magic = init_fastdiv_values(neqk1); + const sycl::uint3 rq3_magic = init_fastdiv_values(rq3); + + int cc = ggml_sycl_info().devices[ggml_sycl_get_device()].cc; + + switch (S_v) { + case 16: + { + constexpr int sv = 16; + stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, + n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, + sb3, neqk1_magic, rq3_magic, scale); + }); + } + break; + case 32: + { + constexpr int sv = 32; + stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, + n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, + sb3, neqk1_magic, rq3_magic, scale); + }); + } + break; + case 64: { + { + constexpr int sv = 64; + stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + gated_delta_net_sycl( + q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, + sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + }); + } + break; + } + case 128: { + { + constexpr int sv = 128; + stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + gated_delta_net_sycl( + q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, + sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + }); + } + break; + } + default: + GGML_ABORT("fatal error"); + break; + } +} + +void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_tensor * src_q = dst->src[0]; + ggml_tensor * src_k = dst->src[1]; + ggml_tensor * src_v = 
dst->src[2]; + ggml_tensor * src_g = dst->src[3]; + ggml_tensor * src_beta = dst->src[4]; + ggml_tensor * src_state = dst->src[5]; + + GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne); + GGML_TENSOR_LOCALS(size_t , nbq, src_q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, src_k, ne); + GGML_TENSOR_LOCALS(size_t , nbk, src_k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb); + GGML_TENSOR_LOCALS(size_t, nbb, src_beta, nb); + + const int64_t S_v = nev0; + const int64_t H = nev1; + const int64_t n_tokens = nev2; + const int64_t n_seqs = nev3; + + const bool kda = (src_g->ne[0] == S_v); + + GGML_ASSERT(neq1 == nek1); + const int64_t neqk1 = neq1; + + const int64_t rq3 = nev3 / neq3; + + const float * q_d = (const float *) src_q->data; + const float * k_d = (const float *) src_k->data; + const float * v_d = (const float *) src_v->data; + const float * g_d = (const float *) src_g->data; + const float * b_d = (const float *) src_beta->data; + + const float * s_d = (const float *) src_state->data; + float * dst_d = (float *) dst->data; + + GGML_ASSERT(ggml_is_contiguous_rows(src_q)); + GGML_ASSERT(ggml_is_contiguous_rows(src_k)); + GGML_ASSERT(ggml_is_contiguous_rows(src_v)); + GGML_ASSERT(ggml_are_same_stride(src_q, src_k)); + GGML_ASSERT(src_g->ne[0] == 1 || kda); + GGML_ASSERT(ggml_is_contiguous(src_g)); + GGML_ASSERT(ggml_is_contiguous(src_beta)); + GGML_ASSERT(ggml_is_contiguous(src_state)); + + // strides in floats (beta strides used for both g and beta offset computation) + const int64_t sq1 = nbq1 / sizeof(float); + const int64_t sq2 = nbq2 / sizeof(float); + const int64_t sq3 = nbq3 / sizeof(float); + const int64_t sv1 = nbv1 / sizeof(float); + const int64_t sv2 = nbv2 / sizeof(float); + const int64_t sv3 = nbv3 / sizeof(float); + const int64_t sb1 = nbb1 / sizeof(float); + const int64_t sb2 = nbb2 / sizeof(float); + const int64_t sb3 = nbb3 / sizeof(float); + + const float scale = 1.0f / sqrtf((float) S_v); + + dpct::queue_ptr 
stream = ctx.stream(); + + if (kda) { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, stream); + } else { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, stream); + } +} + +void ggml_sycl_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6); + ggml_sycl_op_gated_delta_net(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/gated_delta_net.hpp b/ggml/src/ggml-sycl/gated_delta_net.hpp new file mode 100644 index 0000000000..a3308ee876 --- /dev/null +++ b/ggml/src/ggml-sycl/gated_delta_net.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include +#include "dpct/helper.hpp" +#include "common.hpp" +#include "ggml.h" + +void ggml_sycl_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f887061b27..1281970584 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -35,6 +35,7 @@ #endif #include +#include "ggml.h" #include "ggml-sycl.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -43,17 +44,18 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/common.hpp" #include "ggml-sycl/element_wise.hpp" +#include "ggml-sycl/gated_delta_net.hpp" +#include "ggml-sycl/gemm.hpp" +#include "ggml-sycl/getrows.hpp" #include "ggml-sycl/norm.hpp" #include "ggml-sycl/presets.hpp" -#include "ggml-sycl/gemm.hpp" +#include "ggml-sycl/quantize.hpp" +#include "ggml-sycl/repeat_back.hpp" #include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/set.hpp" -#include "ggml-sycl/sycl_hw.hpp" -#include "ggml-sycl/getrows.hpp" -#include "ggml-sycl/repeat_back.hpp" -#include "ggml-sycl/quantize.hpp" #include "ggml-sycl/ssm_conv.hpp" -#include "ggml.h" +#include 
"ggml-sycl/sycl_hw.hpp" + static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; @@ -99,6 +101,8 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].nsm = prop.get_max_compute_units() / 16; //16: Number of Xe Cores info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.devices[i].smpbo = prop.get_local_mem_size(); + info.devices[i].warp_size = WARP_SIZE; + info.max_work_group_sizes[i] = prop.get_max_work_group_size(); info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units(); @@ -4181,6 +4185,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_GATED_LINEAR_ATTN: ggml_sycl_op_gated_linear_attn(ctx, dst); break; + case GGML_OP_GATED_DELTA_NET: + ggml_sycl_gated_delta_net(ctx, dst); + break; case GGML_OP_SSM_CONV: ggml_sycl_ssm_conv(ctx, dst); break; @@ -4890,6 +4897,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: case GGML_OP_GATED_LINEAR_ATTN: + case GGML_OP_GATED_DELTA_NET: return true; case GGML_OP_SSM_CONV: return op->type == GGML_TYPE_F32 && diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 3c81805b84..7092361d2e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4981,8 +4981,10 @@ static vk_device ggml_vk_get_device(size_t idx) { std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues - const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); - const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, 
compute_queue_family_index, 1); + // On AMD, the graphics queue seems to be faster, so don't avoid it + const vk::QueueFlagBits graphics_flag = device->vendor_id == VK_VENDOR_ID_AMD ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; + const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1); + const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1); const float priorities[] = { 1.0f, 1.0f }; device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; @@ -5441,13 +5443,11 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_load_shaders(device); - const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN; - if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 
1 : 0; ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); - device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); + device->async_use_transfer_queue = (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); } else { // TODO: Use pointer or reference to avoid copy device->transfer_queue.copyFrom(device->compute_queue); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index ec48f5b115..11b7dce857 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -245,7 +245,7 @@ void main() { #endif } [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Sf[r][c] += ACC_TYPE(dot(Q_cache[r], K_Tf)); + Sf[r][c] += dot(ACC_TYPEV4(Q_cache[r]), ACC_TYPEV4(K_Tf)); } } } @@ -270,7 +270,7 @@ void main() { #endif } [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Sf[r][c] += ACC_TYPE(dot(Qf[tile_row(r) * qf_stride + d * D_split + d_tid], K_Tf)); + Sf[r][c] += dot(ACC_TYPEV4(Qf[tile_row(r) * qf_stride + d * D_split + d_tid]), ACC_TYPEV4(K_Tf)); } } } diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh index 484e56fd8f..0b161141f4 100755 --- a/scripts/get-hellaswag.sh +++ b/scripts/get-hellaswag.sh @@ -1,10 +1,38 @@ -#!/usr/bin/env bash +#!/bin/sh +# vim: set ts=4 sw=4 et: -wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt +FILE="hellaswag_val_full.txt" +URL="https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/$FILE" -echo "Usage:" -echo "" -echo " ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" -echo "" +die() { + printf "%s\n" "$@" >&2 + exit 1 +} -exit 0 +have_cmd() { + for cmd; do + command -v "$cmd" >/dev/null || return + done +} + 
+dl() { + [ -f "$2" ] && return + if have_cmd wget; then + wget "$1" -O "$2" + elif have_cmd curl; then + curl -L "$1" -o "$2" + else + die "Please install wget or curl" + fi +} + +if [ ! -f "$FILE" ]; then + dl "$URL" "$FILE" || exit +fi + +cat <&2 + exit 1 +} -exit 0 +have_cmd() { + for cmd; do + command -v "$cmd" >/dev/null || return + done +} + +dl() { + [ -f "$2" ] && return + if have_cmd wget; then + wget "$1" -O "$2" + elif have_cmd curl; then + curl -L "$1" -o "$2" + else + die "Please install wget or curl" + fi +} + +if [ ! -f "$FILE" ]; then + dl "$URL" "$FILE" || exit +fi + +cat < 1) { + llama_kv_cell_ext ext; + io.read_to(&ext, sizeof(ext)); + cells.ext_set(i, ext); + } + for (uint32_t j = 0; j < n_seq_id; ++j) { llama_seq_id seq_id; io.read_to(&seq_id, sizeof(seq_id)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6cc28eff28..bae02e32b1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7462,6 +7462,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!layer.wo_s && layer.wo) { layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.wqkv_s && layer.wqkv) { + layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wqkv_gate_s && layer.wqkv_gate) { + layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // dense FFN weight scales (per-tensor, shape {1}) if (!layer.ffn_gate_s && layer.ffn_gate) { @@ -7473,6 +7479,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!layer.ffn_up_s && layer.ffn_up) { layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) { + layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) { + layer.ffn_down_shexp_s = 
create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) { + layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // MoE expert weight scales (per-expert, shape {n_expert}) if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) { @@ -7484,6 +7499,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!layer.ffn_up_exps_s && layer.ffn_up_exps) { layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED); } + + // recurrent / linear-attention weight scales (per-tensor, shape {1}) + if (!layer.ssm_in_s && layer.ssm_in) { + layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_out_s && layer.ssm_out) { + layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_alpha_s && layer.ssm_alpha) { + layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_beta_s && layer.ssm_beta) { + layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } } } diff --git a/src/llama-model.h b/src/llama-model.h index 9a2dacecca..aefcfe700f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -401,9 +401,18 @@ struct llama_layer { struct ggml_tensor * wk_s = nullptr; struct ggml_tensor * wv_s = nullptr; struct ggml_tensor * wo_s = nullptr; + struct ggml_tensor * wqkv_s = nullptr; + struct ggml_tensor * wqkv_gate_s = nullptr; struct ggml_tensor * ffn_gate_s = nullptr; struct ggml_tensor * ffn_up_s = nullptr; struct ggml_tensor * ffn_down_s = nullptr; + struct ggml_tensor * ffn_gate_shexp_s = nullptr; + struct ggml_tensor * ffn_up_shexp_s = nullptr; + struct ggml_tensor * ffn_down_shexp_s = nullptr; + struct ggml_tensor * ssm_in_s = nullptr; + struct ggml_tensor * ssm_out_s = 
nullptr; + struct ggml_tensor * ssm_alpha_s = nullptr; + struct ggml_tensor * ssm_beta_s = nullptr; // altup & laurel struct ggml_tensor * per_layer_inp_gate = nullptr; diff --git a/src/models/mamba-base.cpp b/src/models/mamba-base.cpp index 9de587db55..c37f29c487 100644 --- a/src/models/mamba-base.cpp +++ b/src/models/mamba-base.cpp @@ -42,7 +42,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp, cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur); + ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s); // split the above in two // => {d_inner, n_seq_tokens, n_seqs} ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); @@ -137,7 +137,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp, y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = build_lora_mm(layer.ssm_out, y); + cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s); } // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} @@ -184,7 +184,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp, // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs} - ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); + ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s); // split the above in three ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0], @@ -278,7 +278,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp, y = ggml_reshape_3d(ctx0, y, d_inner, 
n_seq_tokens, n_seqs); // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = build_lora_mm(model.layers[il].ssm_out, y); + cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s); } // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index 7af99174d1..d3fccfb70d 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -107,9 +107,9 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor * ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) { if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s, NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s, NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -136,7 +136,10 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il, - router_logits); + router_logits, nullptr, + model.layers[il].ffn_up_exps_s, + nullptr, // no gate + model.layers[il].ffn_down_exps_s); cb(moe_out, "ffn_moe_out", il); if (model.layers[il].ffn_latent_up) { @@ -144,9 +147,9 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla } ggml_tensor * ffn_shexp = build_ffn(inp_emb, - model.layers[il].ffn_up_shexp, NULL, NULL, - NULL /* no gate */ , NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, + model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s, + NULL /* no gate */ , NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s, NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il); 
cb(ffn_shexp, "ffn_shexp", il); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index e12dad7001..d07579ee87 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -90,11 +90,11 @@ std::pair llm_build_qwen35::build_qkvz( const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); + ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s); qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); cb(qkv_mixed, "linear_attn_qkv_mixed", il); - ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); + ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s); cb(z, "z", il); return { qkv_mixed, z }; @@ -123,7 +123,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention // Qwen3Next uses a single Q projection that outputs query + gate - ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] + ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ] cb(Qcur_full, "Qcur_full", il); ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, @@ -135,10 +135,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); cb(Kcur, "Kcur", il); - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); cb(Vcur, "Vcur", il); // Apply K normalization @@ -186,7 
+186,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( cur = ggml_mul(ctx0, cur, gate_sigmoid); cb(cur, "attn_gated", il); - cur = build_lora_mm(model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s); cb(cur, "attn_output", il); return cur; @@ -217,14 +217,14 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( ggml_tensor * qkv_mixed = qkvz.first; ggml_tensor * z = qkvz.second; - ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur); + ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s); beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); cb(beta, "beta", il); beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur); - alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); + ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); + alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); cb(alpha, "alpha", il); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); @@ -356,7 +356,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( cb(final_output, "final_output", il); // Output projection - cur = build_lora_mm(model.layers[il].ssm_out, final_output); + cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s); cb(cur, "linear_attn_out", il); // Reshape back to original dimensions @@ -370,9 +370,9 @@ ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il) GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s, + model.layers[il].ffn_down, NULL, 
model.layers[il].ffn_down_s, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 8d07c7ed27..b38660c0bc 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -90,11 +90,11 @@ std::pair llm_build_qwen35moe::build_qkvz( const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); + ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s); qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); cb(qkv_mixed, "linear_attn_qkv_mixed", il); - ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); + ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s); cb(z, "z", il); return { qkv_mixed, z }; @@ -123,7 +123,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn( // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention // Qwen3Next uses a single Q projection that outputs query + gate - ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] + ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ] cb(Qcur_full, "Qcur_full", il); ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, @@ -135,10 +135,10 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn( Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); cb(Kcur, "Kcur", il); - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, 
model.layers[il].wv_s); cb(Vcur, "Vcur", il); // Apply K normalization @@ -186,7 +186,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn( cur = ggml_mul(ctx0, cur, gate_sigmoid); cb(cur, "attn_gated", il); - cur = build_lora_mm(model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s); cb(cur, "attn_output", il); return cur; @@ -217,14 +217,14 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( ggml_tensor * qkv_mixed = qkvz.first; ggml_tensor * z = qkvz.second; - ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur); + ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s); beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); cb(beta, "beta", il); beta = ggml_sigmoid(ctx0, beta); - ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur); - alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); + ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); + alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); cb(alpha, "alpha", il); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); @@ -356,7 +356,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( cb(final_output, "final_output", il); // Output projection - cur = build_lora_mm(model.layers[il].ssm_out, final_output); + cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s); cb(cur, "linear_attn_out", il); // Reshape back to original dimensions @@ -380,16 +380,19 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int LLM_FFN_SILU, true, hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, - nullptr, model.layers[il].ffn_gate_up_exps); + nullptr, model.layers[il].ffn_gate_up_exps, + model.layers[il].ffn_up_exps_s, + model.layers[il].ffn_gate_exps_s, + 
model.layers[il].ffn_down_exps_s); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation if (model.layers[il].ffn_up_shexp != nullptr) { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, + model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s, + model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s, + model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index abf914faa1..c9896cc11e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8576,11 +8576,12 @@ static std::vector> make_test_cases_eval() { } } - for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) { + for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 320, 576 }) { for (int hsv : { 40, 64, 72, 80, 96, 128, 192, 256, 512 }) { - if (hsk != 192 && hsk != 576 && hsk != hsv) continue; + if (hsk != 192 && hsk != 320 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA + if (hsk == 320 && hsv != 256) continue; // MLA for (bool mask : { true, false } ) { for (bool sinks : { true, false } ) { @@ -8589,12 +8590,13 @@ static std::vector> make_test_cases_eval() { for (float logit_softcap : {0.0f, 10.0f}) { if (hsk != 128 && logit_softcap != 0.0f) continue; for (int nh : { 1, 4 }) { - if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash + if (nh == 1 && hsk != 320 && hsk != 576) continue; // GLM 4.7 Flash for (int nr3 : { 1, 3, }) { if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes - for (int nr2 : { 1, 4, 12, 20 }) { + for (int nr2 : { 1, 4, 12, 20, 32 }) { if (nr2 == 12 && hsk != 128) 
continue; if (nr2 == 20 && (nh != 1 || hsk != 576)) continue; + if (nr2 == 32 && (nh != 1 || hsk != 320)) continue; //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) { for (int kv : { 113, 512, 1024, }) { if (nr2 != 1 && kv != 512) continue; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 4c2ae7a033..7c4342d6bf 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -215,7 +215,7 @@ struct cli_context { inputs.parallel_tool_calls = false; inputs.add_generation_prompt = true; inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - inputs.enable_thinking = common_chat_templates_support_enable_thinking(chat_params.tmpls.get()); + inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false; // Apply chat template to the list of messages return common_chat_templates_apply(chat_params.tmpls.get(), inputs); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 3be3c27e87..a39de8c928 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -62,6 +62,10 @@ set_target_properties(mtmd PROPERTIES PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") +set_target_properties(mtmd + PROPERTIES + PRIVATE_HEADER debug/mtmd-debug.h) + install(TARGETS mtmd LIBRARY PUBLIC_HEADER) if (NOT MSVC) @@ -96,3 +100,9 @@ if(LLAMA_TOOLS_INSTALL) endif() target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +# mtmd-debug tool +add_executable(llama-mtmd-debug debug/mtmd-debug.cpp) +set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug) +target_link_libraries(llama-mtmd-debug PRIVATE common mtmd Threads::Threads) +target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 06e1ffb7ca..3eb66f9145 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -579,10 
+579,9 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { } } -void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); - // // API used internally with mtmd // projector_type clip_get_projector_type(const struct clip_ctx * ctx); +void clip_set_debug_output_embeddings(struct clip_ctx * ctx, bool debug); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b6b31ae866..3d6cf6fd84 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -159,6 +159,8 @@ struct clip_ctx { clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; bool is_allocated = false; + bool debug_output_embeddings = false; + clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); @@ -205,6 +207,8 @@ struct clip_ctx { if (ctx_params.cb_eval != nullptr) { ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); } + + debug_output_embeddings = std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr; } ~clip_ctx() { @@ -2193,8 +2197,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; - - // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } if (loader.has_audio && !skip_audio) { @@ -3981,7 +3983,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set - if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) { + if (ctx->debug_output_embeddings) { const int64_t n_embd = embeddings->ne[0]; const int64_t n_tokens = embeddings->ne[1]; std::vector emb_data(n_embd * n_tokens); @@ -4160,14 +4162,7 @@ const clip_hparams * clip_get_hparams(const struct 
clip_ctx * ctx) { // // API for debugging // -void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { - clip_image_f32 img; - img.nx = w; - img.ny = h; - img.buf.resize(h * w * 3); - for (int i = 0; i < h * w * 3; i++) { - img.buf[i] = static_cast(fill_value); - } - clip_image_encode(ctx, 1, &img, nullptr); - GGML_ASSERT(img.buf.empty() && "expected, always stop here"); + +void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) { + ctx->debug_output_embeddings = enable; } diff --git a/tools/mtmd/debug/mtmd-debug.cpp b/tools/mtmd/debug/mtmd-debug.cpp new file mode 100644 index 0000000000..d42806ec3f --- /dev/null +++ b/tools/mtmd/debug/mtmd-debug.cpp @@ -0,0 +1,229 @@ +#include "mtmd-debug.h" + +#include "arg.h" +#include "debug.h" +#include "log.h" +#include "common.h" +#include "llama.h" +#include "ggml.h" +#include "mtmd.h" +#include "mtmd-helper.h" + +#include +#include +#include +#include +#include + +// INTERNAL TOOL FOR DEBUGGING PURPOSES ONLY +// NOT INTENDED FOR PUBLIC USE + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Internal debugging tool for mtmd; See mtmd-debug.md for the pytorch equivalent code\n" + "Note: we repurpose some args from other examples, they will have different meaning here\n" + "\n" + "Usage: %s -m --mmproj -p -n --image --audio