diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 10cb02ff2c..b3cff96604 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -22,12 +22,11 @@
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
-- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
-SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
+SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
@@ -42,6 +41,9 @@ The following releases are verified and recommended:
## News
+- 2026.02
+  - Removed support for Nvidia & AMD GPUs: the oneAPI plugins for these targets are no longer available (their download/installation channels have been discontinued), so the software can no longer be built for Nvidia & AMD GPUs.
+
- 2025.11
- Support malloc memory on device more than 4GB.
@@ -111,8 +113,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
-| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
-| Intel Arc B-Series | Support | Arc B580 |
+| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
+| Intel Arc B-Series | Support | Arc B580 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
@@ -127,20 +129,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
### Other Vendor GPU
-**Verified devices**
-
-| Nvidia GPU | Status | Verified Model |
-|--------------------------|-----------|----------------|
-| Ampere Series | Supported | A100, A4000 |
-| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
-
-| AMD GPU | Status | Verified Model |
-|--------------------------|--------------|----------------|
-| Radeon Pro | Experimental | W6800 |
-| Radeon RX | Experimental | 6700 XT |
-
-Note: AMD GPU support is highly experimental and is incompatible with F16.
-Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
+Not available: support for Nvidia & AMD GPUs has been removed because the required oneAPI plugins are no longer distributed.
## Docker
@@ -149,11 +138,11 @@ The docker build option is currently limited to *Intel GPU* targets.
### Build image
```sh
-# Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
-
# Using FP32
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
+
+# Using FP16
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
```
*Notes*:
@@ -212,14 +201,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
-- **Nvidia GPU**
-
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
-
-- **AMD GPU**
-
-To target AMD GPUs with SYCL, the ROCm stack must be installed first.
-
2. **Install IntelĀ® oneAPI Base toolkit**
SYCL backend depends on:
@@ -248,23 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|2025.1|
|2024.1|
-- **Adding support to Nvidia GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-
-**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
-
-```sh
-git clone https://github.com/oneapi-src/oneDNN.git
-cd oneDNN
-cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-cmake --build build-nvidia --config Release
-```
-
-- **Adding support to AMD GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
-
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -285,25 +249,6 @@ When targeting an intel GPU, the user should expect one or more devices among th
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO [24.39.31294]
```
-- **Nvidia GPU**
-
-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
-
-```
-[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
-```
-
-- **AMD GPU**
-
-For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
-
-```
-[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
-[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
-```
-
### II. Build llama.cpp
#### Intel GPU
@@ -332,47 +277,6 @@ It is possible to come across some precision issues when running tests that stem
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
as `-cl-fp32-correctly-rounded-divide-sqrt`
-#### Nvidia GPU
-
-The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
-By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
-
-```sh
-# Build LLAMA with Nvidia BLAS acceleration through SYCL
-# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
-GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
-
-# Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
-
-# build all binary
-cmake --build build --config Release -j -v
-```
-
-It is possible to come across some precision issues when running tests that stem from using faster
-instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
-
-#### AMD GPU
-
-The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
-By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
-
-```sh
-# Build LLAMA with rocBLAS acceleration through SYCL
-
-## AMD
-# Use FP32, FP16 is not supported
-# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
-GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# build all binary
-cmake --build build --config Release -j -v
-```
-
### III. Run the inference
#### Retrieve and prepare model
@@ -766,15 +670,15 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
-| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
-| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_TARGET | INTEL *(default)* | Set the SYCL target device type. |
+| GGML_SYCL_DEVICE_ARCH | Optional | Set the SYCL device architecture. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
-| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
-1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
+1. FP32 and FP16 can have different performance impacts depending on the LLM. It is recommended to test both to find the better prompt processing performance for your models. You need to rebuild after changing `GGML_SYCL_F16=OFF/ON`.
#### Runtime
@@ -782,7 +686,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
-| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
+| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graphs feature. Disabled by default because SYCL Graph is still under development and does not yet offer better performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 5a89d8dd68..eefdd9725c 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -1,7 +1,7 @@
message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
-if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
- message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
+ message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
endif()
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
@@ -125,25 +125,22 @@ endif()
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
if (GGML_SYCL_F16)
- if (GGML_SYCL_TARGET STREQUAL "AMD")
- message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
- endif()
add_compile_definitions(GGML_SYCL_F16)
endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
-elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- # INFO: Allowed Sub_group_sizes are not consistent through all
- # hip targets. For example, 64 is used for certain models, but the backend
- # does not support it.
- # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
- add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+
+ # Link against Intel oneMKL
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ set(SYCL_COMPILER ON)
+ endif()
+ find_package(MKL REQUIRED)
+ target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
else()
# default for other target
+ message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
endif()
@@ -151,82 +148,6 @@ if (GGML_SYCL_GRAPH)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()
-# Link against Intel oneMKL or oneMath
-if (GGML_SYCL_TARGET STREQUAL "INTEL")
- # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
- # See https://github.com/uxlfoundation/oneMath/issues/654
- if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- set(SYCL_COMPILER ON)
- endif()
- find_package(MKL REQUIRED)
- target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
-else()
- find_package(oneMath QUIET)
- if (NOT oneMath_FOUND)
- message(STATUS "oneMath not found: oneMath will be automatically downloaded")
- # Use FetchContent to automatically pull and build oneMath
- include(FetchContent)
- set(BUILD_FUNCTIONAL_TESTS False)
- set(BUILD_EXAMPLES False)
- set(TARGET_DOMAINS blas)
- if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- set(ENABLE_MKLCPU_BACKEND False)
- set(ENABLE_MKLGPU_BACKEND False)
- set(ENABLE_CUBLAS_BACKEND True)
- elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- set(ENABLE_MKLCPU_BACKEND False)
- set(ENABLE_MKLGPU_BACKEND False)
- set(ENABLE_ROCBLAS_BACKEND True)
- # Ensure setting a string variable here is not overriden by oneMath CACHE variables
- cmake_policy(SET CMP0126 NEW)
- # Setting the device architecture is only needed and useful for AMD devices in oneMath
- set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
- endif()
- FetchContent_Declare(
- ONEMATH
- GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
- GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
- )
- FetchContent_MakeAvailable(ONEMATH)
- # Create alias to match with find_package targets name
- function(onemath_alias target)
- if (TARGET ${target}_obj)
- # Silence verbose warnings from external libraries
- target_compile_options(${target}_obj PRIVATE -w)
- endif()
- if (TARGET ${target})
- add_library(ONEMATH::${target} ALIAS ${target})
- endif()
- endfunction()
- onemath_alias(onemath)
- onemath_alias(onemath_blas_mklcpu)
- onemath_alias(onemath_blas_mklgpu)
- onemath_alias(onemath_blas_cublas)
- onemath_alias(onemath_blas_rocblas)
- endif()
-
- # Below oneMath compile-time dispatching is used for better performance
- if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
- target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
- target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
- elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- if (NOT GGML_SYCL_DEVICE_ARCH)
- message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
- endif()
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
- target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
- target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
- else()
- # Fallback to oneMath runtime dispatcher
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
- endif()
-endif()
-
if (GGML_SYCL_DEVICE_ARCH)
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 8ae8098717..ece66a7ac1 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -15,17 +15,9 @@
#include
#include
-#include