sycl : support to malloc memory on device more than 4GB, update the doc and script (#17566)
Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>
This commit is contained in:
parent
f698a79c63
commit
7d2add51d8
|
|
@ -42,6 +42,9 @@ The following releases are verified and recommended:
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
|
- 2025.11
|
||||||
|
- Support malloc memory on device more than 4GB.
|
||||||
|
|
||||||
- 2025.2
|
- 2025.2
|
||||||
- Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
|
- Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
|
||||||
|GPU|Base tokens/s|Increased tokens/s|Percent|
|
|GPU|Base tokens/s|Increased tokens/s|Percent|
|
||||||
|
|
@ -789,6 +792,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
||||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||||
|
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Known Issues
|
## Known Issues
|
||||||
|
|
@ -835,6 +840,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
|
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
|
||||||
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
|
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
|
||||||
|
|
||||||
|
- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
|
||||||
|
|
||||||
|
You need to enable to support 4GB memory malloc by:
|
||||||
|
```
|
||||||
|
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
```
|
||||||
|
|
||||||
### **GitHub contribution**:
|
### **GitHub contribution**:
|
||||||
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
|
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
|
||||||
NGL=99
|
NGL=99
|
||||||
CONTEXT=4096
|
CONTEXT=4096
|
||||||
|
|
||||||
|
#support malloc device memory more than 4GB.
|
||||||
|
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
|
||||||
if [ $# -gt 0 ]; then
|
if [ $# -gt 0 ]; then
|
||||||
GGML_SYCL_DEVICE=$1
|
GGML_SYCL_DEVICE=$1
|
||||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
# If you want more control, DPC++ Allows selecting a specific device through the
|
# If you want more control, DPC++ Allows selecting a specific device through the
|
||||||
# following environment variable
|
# following environment variable
|
||||||
#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
#export GGML_SYCL_DEBUG=1
|
#export GGML_SYCL_DEBUG=1
|
||||||
|
|
@ -18,11 +18,14 @@ MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
|
||||||
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
|
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
|
||||||
CONTEXT=4096
|
CONTEXT=4096
|
||||||
|
|
||||||
|
#support malloc device memory more than 4GB.
|
||||||
|
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
|
||||||
if [ $# -gt 0 ]; then
|
if [ $# -gt 0 ]; then
|
||||||
GGML_SYCL_DEVICE=$1
|
GGML_SYCL_DEVICE=$1
|
||||||
echo "Using $GGML_SYCL_DEVICE as the main GPU"
|
echo "Using $GGML_SYCL_DEVICE as the main GPU"
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||||
else
|
else
|
||||||
#use multiple GPUs with same max compute units
|
#use multiple GPUs with same max compute units
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,7 @@
|
||||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
|
:: support malloc device memory more than 4GB.
|
||||||
|
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
|
||||||
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,7 @@
|
||||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
|
:: support malloc device memory more than 4GB.
|
||||||
|
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||||
|
|
||||||
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
|
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,10 @@ if (GGML_SYCL_F16)
|
||||||
add_compile_definitions(GGML_SYCL_F16)
|
add_compile_definitions(GGML_SYCL_F16)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||||
|
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||||
|
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
|
||||||
|
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||||
# INFO: Allowed Sub_group_sizes are not consistent through all
|
# INFO: Allowed Sub_group_sizes are not consistent through all
|
||||||
|
|
@ -100,7 +103,8 @@ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||||
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
|
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
|
||||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||||
else()
|
else()
|
||||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
# default for other target
|
||||||
|
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (GGML_SYCL_GRAPH)
|
if (GGML_SYCL_GRAPH)
|
||||||
|
|
|
||||||
|
|
@ -515,9 +515,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
||||||
const int64_t ne = ggml_nelements(src0);
|
const int64_t ne = ggml_nelements(src0);
|
||||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||||
|
|
||||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
|
||||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS01;
|
GGML_TENSOR_BINARY_OP_LOCALS01;
|
||||||
|
|
||||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue