diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index c0a422b3dc..10cb02ff2c 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -119,7 +119,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the *Notes:* - **Memory** - - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`. + - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`. - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. - **Execution Unit (EU)** @@ -423,16 +423,12 @@ Choose one of following methods to run. - Use device 0: ```sh -./examples/sycl/run-llama2.sh 0 -# OR -./examples/sycl/run-llama3.sh 0 +./examples/sycl/test.sh -mg 0 ``` - Use multiple devices: ```sh -./examples/sycl/run-llama2.sh -# OR -./examples/sycl/run-llama3.sh +./examples/sycl/test.sh ``` 2. Command line @@ -455,13 +451,13 @@ Examples: - Use device 0: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 +ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap ``` - Use multiple devices: ```sh -ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer +ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer --mmap ``` *Notes:* @@ -577,13 +573,13 @@ Or, use CMake presets to build: ```sh cmake --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target llama-completion cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release -cmake --build build-x64-windows-sycl-release -j --target llama-cli +cmake --build build-x64-windows-sycl-release -j --target llama-completion cmake --preset x64-windows-sycl-debug -cmake --build build-x64-windows-sycl-debug -j --target llama-cli +cmake --build build-x64-windows-sycl-debug -j --target llama-completion ``` #### 3. Visual Studio @@ -608,7 +604,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro - For a minimal experimental setup, you can build only the inference executable using: ```Powershell - cmake --build build --config Release -j --target llama-cli + cmake --build build --config Release -j --target llama-completion ``` ##### - Generating a Visual Studio Solution @@ -714,13 +710,7 @@ Choose one of following methods to run. 1. Script ``` -examples\sycl\win-run-llama-2.bat -``` - -or - -``` -examples\sycl\win-run-llama-3.bat +examples\sycl\win-test.bat ``` 2. 
Command line @@ -744,13 +734,13 @@ Examples: - Use device 0: ``` -build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 +build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap ``` - Use multiple devices: ``` -build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer +build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer --mmap ``` diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index cf23619ee0..d33f82f339 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -18,13 +18,14 @@ CONTEXT=4096 #support malloc device memory more than 4GB. export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 +LOAD_MODE='--mmap' if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE} else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE} fi diff --git a/examples/sycl/run-llama3.sh b/examples/sycl/run-llama3.sh deleted file mode 100755 index feee5165e9..0000000000 --- a/examples/sycl/run-llama3.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# MIT license -# Copyright (C) 2025 Intel Corporation -# SPDX-License-Identifier: MIT - -# If you want more control, DPC++ Allows selecting a specific device through the -# following environment variable -export ONEAPI_DEVICE_SELECTOR="level_zero:0" -source /opt/intel/oneapi/setvars.sh - -#export GGML_SYCL_DEBUG=1 - -#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. - -INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" -MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using. -CONTEXT=4096 - -#support malloc device memory more than 4GB. 
-export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
-else
-    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
-fi
diff --git a/examples/sycl/test.sh b/examples/sycl/test.sh
new file mode 100755
index 0000000000..140c191466
--- /dev/null
+++ b/examples/sycl/test.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+Help() {
+    cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+This script runs llama-completion with the SYCL backend using the specified options.
+
+Options:
+  -h,  --help            Display this help message and exit.
+  -c,  --context         Set the context length; larger values need more memory.
+  -p,  --prompt          Prompt to start generation with.
+  -m,  --model           Full model file path.
+  -mg, --main-gpu        Set main GPU ID (0 - n) for single GPU mode.
+  -sm, --split-mode      How to split the model across multiple GPUs, one of:
+                           - none: use one GPU only
+                           - layer (default): split layers and KV across GPUs
+                           - row: split rows across GPUs
+  -ngl,--n-gpu-layers    Max. number of layers to store in VRAM (default: 99).
+  -lv, --log-verbosity   Set the verbosity threshold. Messages with a higher verbosity will be
+                         ignored. Values:
+                           - 0: generic output
+                           - 1: error
+                           - 2: warning
+                           - 3: info
+                           - 4: debug
+
+
+EOF
+}
+
+BIN_FILE=./build/bin/llama-completion
+SEED=0
+GPUS_SETTING=""
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=99
+CONTEXT=4096
+GGML_SYCL_DEVICE=-1
+SPLIT_MODE=layer
+LOG_VERBOSE=3
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -c|--context)
+            CONTEXT=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -p|--prompt)
+            # Prompt text used to start generation
+            INPUT_PROMPT="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -m|--model)
+            MODEL_FILE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -mg|--main-gpu)
+            GGML_SYCL_DEVICE=$2
+            SPLIT_MODE=none
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -sm|--split-mode)
+            SPLIT_MODE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -ngl|--n-gpu-layers)
+            NGL=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -lv|--log-verbosity)
+            LOG_VERBOSE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -h|--help)
+            Help
+            exit 0
+            ;;
+        *)
+            # Unknown option: report it and exit
+            echo "Invalid option: $1"
+            # Positional arguments are not supported by this script
+            exit 1
+            ;;
+    esac
+done
+
+
+
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#support malloc device memory more than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
+
+if [ $GGML_SYCL_DEVICE -ne -1 ]; then
+    echo "Use $GGML_SYCL_DEVICE as main GPU"
+    #use single GPU only
+    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
+    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
+else
+    echo "Use all Intel GPUs, including iGPU & dGPU"
+fi
+
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p \"${INPUT_PROMPT}\" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap"
+ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+
diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat
index 32ff673ae2..1f2dab8d0a 100644
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
diff --git a/examples/sycl/win-run-llama3.bat b/examples/sycl/win-test.bat
similarity index 69%
rename from examples/sycl/win-run-llama3.bat
rename to examples/sycl/win-test.bat
index ea4ae69d6c..1f2dab8d0a 100644
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-test.bat
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
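For reference, a short usage sketch of the new `examples/sycl/test.sh` helper, based only on the options the script defines above; the model path shown is the script's default and the other values are illustrative.

```sh
# Run on all detected Intel GPUs with the script defaults
./examples/sycl/test.sh

# Pin the run to device 0 (the script then sets --split-mode none),
# with an explicit model file and a smaller context
./examples/sycl/test.sh -mg 0 -m models/llama-2-7b.Q4_0.gguf -c 2048

# Override the prompt and reduce the number of layers offloaded to VRAM
./examples/sycl/test.sh -p "Building a website can be done in 10 simple steps:\nStep 1:" -ngl 33
```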