diff --git a/.github/labeler.yml b/.github/labeler.yml index 08cfd7e0bc..e9b75bc29e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -104,3 +104,20 @@ OpenCL: - any-glob-to-any-file: - ggml/include/ggml-opencl.h - ggml/src/ggml-opencl/** + - docs/backend/OPENCL.md +Hexagon: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-hexagon.h + - ggml/src/ggml-hexagon/** +WebGPU: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-webgpu.h + - ggml/src/ggml-webgpu/** +OpenVINO: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-openvino.h + - ggml/src/ggml-openvino/** + - docs/backend/OPENVINO.md diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml index 51f0ef2302..b99e614666 100644 --- a/.github/workflows/build-apple.yml +++ b/.github/workflows/build-apple.yml @@ -46,7 +46,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-ios evict-old-files: 1d @@ -124,7 +124,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-tvos evict-old-files: 1d @@ -186,7 +186,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-swift evict-old-files: 1d diff --git a/.github/workflows/build-sanitize.yml b/.github/workflows/build-sanitize.yml index 0b17857504..c7b73d1dd0 100644 --- a/.github/workflows/build-sanitize.yml +++ b/.github/workflows/build-sanitize.yml @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }} evict-old-files: 1d diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index 7c7710fe45..2944cb8401 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -97,19 +97,21 @@ jobs: vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - ggml-ci-cpu-amx: - runs-on: [self-hosted, Linux, CPU, AMX] + # TODO: provision AMX-compatible machine + #ggml-ci-cpu-amx: + # runs-on: [self-hosted, Linux, CPU, AMX] - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 + # steps: + # - name: Clone + # id: checkout + # uses: actions/checkout@v6 - - name: Test - id: ggml-ci - run: | - bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # - name: Test + # id: ggml-ci + # run: | + # bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # TODO: provision AMD GPU machine # ggml-ci-amd-vulkan: # runs-on: [self-hosted, Linux, AMD] @@ -124,6 +126,7 @@ jobs: # vulkaninfo --summary # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # TODO: provision AMD GPU machine # ggml-ci-amd-rocm: # runs-on: [self-hosted, Linux, AMD] diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml index b25ec51471..dba240a37e 100644 --- a/.github/workflows/build-vulkan.yml +++ b/.github/workflows/build-vulkan.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-24-vulkan-llvmpipe evict-old-files: 1d diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fef08d4c00..6d500d3098 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -69,7 +69,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-arm64 evict-old-files: 1d @@ -105,7 +105,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-x64 evict-old-files: 1d @@ -141,7 +141,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-arm64-webgpu evict-old-files: 1d @@ -195,7 +195,8 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + if: ${{ matrix.build != 's390x' && matrix.build != 'ppc64le' }} + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-cpu-${{ matrix.build }} evict-old-files: 1d @@ -324,7 +325,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-24-webgpu evict-old-files: 1d @@ -436,7 +437,7 @@ jobs: sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-22-hip evict-old-files: 1d @@ -467,7 +468,7 @@ jobs: apt-get install -y build-essential git cmake libssl-dev - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-22-musa evict-old-files: 1d @@ -513,7 +514,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-22-sycl evict-old-files: 1d @@ -562,7 +563,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-22-sycl-fp16 evict-old-files: 1d @@ -605,7 +606,7 @@ jobs: - name: ccache if: runner.environment == 'github-hosted' - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1 evict-old-files: 1d @@ -692,7 +693,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-${{ matrix.build }} variant: ccache @@ -798,7 +799,7 @@ jobs: apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-latest-cuda evict-old-files: 1d @@ -830,7 +831,7 @@ jobs: uses: actions/checkout@v6 - name: Install ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-cuda-${{ matrix.cuda }} variant: ccache @@ -883,7 +884,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-sycl variant: ccache @@ -944,7 +945,7 @@ jobs: & $clangPath.FullName --version - name: Install ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ${{ github.job }} evict-old-files: 1d @@ -1068,7 +1069,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-x64-cpu-low-perf evict-old-files: 1d @@ -1094,7 +1095,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-arm64-cpu-low-perf evict-old-files: 1d @@ -1120,7 +1121,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-x64-cpu-high-perf evict-old-files: 1d @@ -1146,7 +1147,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-arm64-cpu-high-perf evict-old-files: 1d @@ -1172,7 +1173,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-arm64-cpu-high-perf-sve evict-old-files: 1d @@ -1198,7 +1199,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-arm64-cpu-kleidiai evict-old-files: 1d @@ -1250,7 +1251,7 @@ jobs: sudo apt-get install -y cmake - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ggml-ci-arm64-cpu-kleidiai-graviton4 evict-old-files: 1d diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index fc3cec5ea1..749debee41 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -29,7 +29,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: copilot-setup-steps evict-old-files: 1d @@ -52,6 +52,6 @@ jobs: - name: Install Python dependencies run: | python3 -m venv .venv - .venv/bin/activate + source .venv/bin/activate pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt pip install flake8 pyright pre-commit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b0f2714ffd..c3181f1772 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -47,7 +47,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-arm64 evict-old-files: 1d @@ -94,7 +94,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: macOS-latest-x64 evict-old-files: 1d @@ -153,7 +153,8 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + if: ${{ matrix.build != 's390x' }} + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-cpu-${{ matrix.build }} evict-old-files: 1d @@ -204,7 +205,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-22-vulkan evict-old-files: 1d @@ -269,7 +270,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-24-openvino-release-no-preset-v1 evict-old-files: 1d @@ -342,7 +343,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-cpu-${{ matrix.arch }} variant: ccache @@ -403,7 +404,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }} variant: ccache @@ -473,7 +474,7 @@ jobs: uses: actions/checkout@v6 - name: Install ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-cuda-${{ matrix.cuda }} variant: ccache @@ -549,7 +550,7 @@ jobs: uses: actions/checkout@v6 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-sycl variant: ccache @@ -629,7 +630,7 @@ jobs: fetch-depth: 0 - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} evict-old-files: 1d @@ -739,7 +740,7 @@ jobs: key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - name: ccache - uses: ggml-org/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.21 with: key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64 evict-old-files: 1d diff --git a/benches/nemotron/nemotron-dgx-spark.md b/benches/nemotron/nemotron-dgx-spark.md index 2bce30a30e..420664194f 100644 --- a/benches/nemotron/nemotron-dgx-spark.md +++ b/benches/nemotron/nemotron-dgx-spark.md @@ -24,9 +24,9 @@ Fri Mar 6 11:39:45 2026 +-----------------------------------------+------------------------+----------------------+ ``` -## ggml-org/nemotron-3-super-120b-GGUF +## ggml-org/Nemotron-3-Super-120B-GGUF -Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF +Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF - `llama-batched-bench` @@ -53,7 +53,6 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 | | 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 | - - `llama-bench` | model | size | params | backend | n_ubatch | fa | test | t/s | @@ -70,3 +69,49 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 | build: 04a65daab (8268) + +## ggml-org/Nemotron-3-Nano-4B-GGUF + +Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF + +- `llama-batched-bench` + +main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.152 | 3371.61 | 0.597 | 53.64 | 0.748 | 726.90 | +| 512 | 32 | 2 | 1088 | 0.319 | 3208.68 | 0.857 | 74.66 | 1.176 | 924.89 | +| 512 | 32 | 4 | 2176 | 0.720 | 2843.56 | 1.323 | 96.78 | 2.043 | 1065.18 | +| 512 | 32 | 8 | 4352 | 1.428 | 2867.96 | 2.311 | 110.76 | 3.739 | 1163.82 | +| 512 | 32 | 16 | 8704 | 2.857 | 2866.94 | 4.203 | 121.82 | 7.060 | 1232.82 | +| 512 | 32 | 32 | 17408 | 5.709 | 2869.76 | 7.964 | 128.58 | 13.673 | 1273.14 | +| 4096 | 32 | 1 | 4128 | 1.458 | 2809.76 | 0.605 | 52.92 | 2.062 | 2001.52 | +| 4096 | 32 | 2 | 8256 | 2.905 | 2819.95 | 0.875 | 73.12 | 3.780 | 2183.95 | +| 4096 | 32 | 4 | 16512 | 5.790 | 2829.74 | 1.361 | 94.07 | 7.151 | 2309.17 | +| 4096 | 32 | 8 | 33024 | 11.598 | 2825.32 | 2.378 | 107.65 | 13.976 | 2362.89 | +| 4096 | 32 | 16 | 66048 | 23.208 | 2823.88 | 4.348 | 117.76 | 27.556 | 2396.89 | +| 4096 | 32 | 32 | 132096 | 46.515 | 2817.85 | 8.279 | 123.69 | 54.794 | 2410.79 | +| 8192 | 32 | 1 | 8224 | 2.950 | 2776.95 | 0.617 | 51.89 | 3.567 | 2305.75 | +| 8192 | 32 | 2 | 16448 | 5.921 | 2767.32 | 0.896 | 71.45 | 6.816 | 2413.05 | +| 8192 | 32 | 4 | 32896 | 11.842 | 2767.21 | 1.401 | 91.34 | 13.243 | 2484.03 | +| 8192 | 32 | 8 | 65792 | 23.726 | 2762.17 | 2.461 | 104.03 | 26.187 | 2512.38 | +| 8192 | 32 | 16 | 131584 | 47.777 | 2743.43 | 4.577 | 111.86 | 52.354 | 2513.36 | +| 8192 | 32 | 32 | 263168 | 96.691 | 2711.16 | 8.772 | 116.73 | 105.463 | 2495.36 | + +- `llama-bench` + +| model | size | params | backend | n_ubatch | fa | test | t/s | +| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 | 2761.90 ± 19.31 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 | 52.85 ± 0.12 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 2687.07 ± 21.84 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d4096 | 52.32 ± 0.23 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 2564.52 ± 57.69 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d8192 | 51.27 ± 0.34 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 2334.02 ± 37.83 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d16384 | 49.71 ± 0.14 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 2041.46 ± 40.45 | +| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d32768 | 46.71 ± 0.13 | + +build: 1bbec6a75 (8382) diff --git a/common/arg.cpp b/common/arg.cpp index 10aa1b5e4f..666339a094 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3115,6 +3115,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.chat_template = read_file(value); } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); + add_opt(common_arg( + {"--skip-chat-parsing"}, + {"--no-skip-chat-parsing"}, + string_format( + "force a pure content parser, even if a Jinja template is specified; model will output everything " + "in the content section, including any reasoning and/or tool calls (default: disabled)" + ), + [](common_params & params, bool value) { + params.force_pure_content_parser = value; + } + ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SKIP_CHAT_PARSING")); add_opt(common_arg( {"--prefill-assistant"}, {"--no-prefill-assistant"}, diff --git a/common/chat.cpp b/common/chat.cpp index cfd5df30a7..fb4751e531 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1519,7 +1519,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ // map developer to system for all models except for GPT-OSS workaround::map_developer_role_to_system(params.messages); } - workaround::func_args_not_string(params.messages); if (!tmpl.original_caps().supports_system_role) { workaround::system_message_not_supported(params.messages); @@ -1532,6 +1531,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ workaround::requires_non_null_content(params.messages); } + if (tmpl.original_caps().supports_object_arguments) { + workaround::func_args_not_string(params.messages); + } + params.extra_context = common_chat_extra_context(); for (auto el : inputs.chat_template_kwargs) { params.extra_context[el.first] = json::parse(el.second); @@ -1559,6 +1562,21 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ } } + if (inputs.force_pure_content) { + LOG_WRN("Forcing pure content template, will not render reasoning or tools separately."); + // Create the result structure + common_chat_params data; + auto params_copy = params; + params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE; + data.prompt = common_chat_template_direct_apply(tmpl, params_copy); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + auto parser = build_chat_peg_parser([](common_chat_peg_builder &p) { + return p.content(p.rest()); + }); + data.parser = parser.save(); + return data; + } + // Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser // Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos && diff --git a/common/chat.h b/common/chat.h index 930987cf77..23e80baf69 100644 --- a/common/chat.h +++ b/common/chat.h @@ -204,6 +204,7 @@ struct common_chat_templates_inputs { std::map chat_template_kwargs; bool add_bos = false; bool add_eos = false; + bool force_pure_content = false; }; struct common_chat_params { diff --git a/common/common.h b/common/common.h index ee7a2d805e..073ef566d2 100644 --- a/common/common.h +++ b/common/common.h @@ -544,6 +544,7 @@ struct common_params { std::string chat_template = ""; // NOLINT bool use_jinja = true; // NOLINT bool enable_chat_template = true; + bool force_pure_content_parser = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable int reasoning_budget = -1; diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp index 1158d5e5d6..ec207a53e8 100644 --- a/common/jinja/caps.cpp +++ b/common/jinja/caps.cpp @@ -75,6 +75,7 @@ std::map caps::to_map() const { {"supports_parallel_tool_calls", supports_parallel_tool_calls}, {"supports_system_role", supports_system_role}, {"supports_preserve_reasoning", supports_preserve_reasoning}, + {"supports_object_arguments", supports_object_arguments}, }; } @@ -158,9 +159,9 @@ caps caps_get(jinja::program & prog) { } ); - JJ_DEBUG("%s\n", ">>> Running capability check: single tool support"); + JJ_DEBUG("%s\n", ">>> Running capability check: single tool with object arguments support"); - // case: tools support: single call + // case: tools support: single call with object arguments caps_try_execute( prog, [&]() { @@ -226,9 +227,7 @@ caps caps_get(jinja::program & prog) { }, [&](bool success, value & messages, value & tools) { if (!success) { - result.supports_tool_calls = false; - result.supports_tools = false; - return; + return; // Nothing can be inferred } auto & tool_name = tools->at(0)->at("function")->at("name"); @@ -242,16 +241,117 @@ caps caps_get(jinja::program & prog) { caps_print_stats(tool_calls, "messages[1].tool_calls"); if (!tool_calls->stats.used) { result.supports_tool_calls = false; + return; + } + + auto & tool_arg = tool_calls->at(0)->at("function")->at("arguments")->at("arg"); + caps_print_stats(tool_arg, "messages[1].tool_calls[0].function.arguments.arg"); + if (tool_arg->stats.used) { + result.supports_object_arguments = true; } } ); + if (!result.supports_object_arguments) { + JJ_DEBUG("%s\n", ">>> Running capability check: single tool with string arguments support"); + + // case: tools support: single call with string arguments + caps_try_execute( + prog, + [&]() { + // messages + return json::array({ + { + {"role", "user"}, + {"content", "User message"}, + }, + { + {"role", "assistant"}, + {"content", ""}, // Some templates expect content to be empty with tool calls + {"tool_calls", json::array({ + { + {"id", "call00001"}, + {"type", "function"}, + {"function", { + {"name", "tool1"}, + {"arguments", R"({"arg": "value"})"} + }} + } + })} + }, + { + {"role", "tool"}, + {"content", "Tool response"}, + {"tool_call_id", "call00001"} + }, + { + {"role", "assistant"}, + {"content", "The tool response was 'tool response'"} + }, + { + {"role", "user"}, + {"content", "User message"}, + }, + }); + }, + [&]() { + // tools + return json::array({ + { + {"name", "tool"}, + {"type", "function"}, + {"function", { + {"name", "tool1"}, + {"description", "Tool description"}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"arg", { + {"type", "string"}, + {"description", "Arg description"}, + }}, + }}, + {"required", json::array({ "arg" })}, + }}, + }}, + }, + }); + }, + [&](bool success, value & messages, value & tools) { + if (!success) { + result.supports_tool_calls = false; + result.supports_tools = false; + return; + } + + auto & tool_name = tools->at(0)->at("function")->at("name"); + caps_print_stats(tool_name, "tools[0].function.name"); + caps_print_stats(tools, "tools"); + if (!tool_name->stats.used) { + result.supports_tools = false; + } + + auto & tool_calls = messages->at(1)->at("tool_calls"); + caps_print_stats(tool_calls, "messages[1].tool_calls"); + if (!tool_calls->stats.used) { + result.supports_tool_calls = false; + return; + } + } + ); + } + JJ_DEBUG("%s\n", ">>> Running capability check: parallel tool support"); // case: tools support: parallel calls caps_try_execute( prog, [&]() { + json args = json(R"({"arg": "value"})"); + if (result.supports_object_arguments) { + args = json{{"arg", "value"}}; + } + // messages return json::array({ { @@ -267,9 +367,7 @@ caps caps_get(jinja::program & prog) { {"type", "function"}, {"function", { {"name", "tool1"}, - {"arguments", { - {"arg", "value"} - }} + {"arguments", args} }} }, { @@ -277,9 +375,7 @@ caps caps_get(jinja::program & prog) { {"type", "function"}, {"function", { {"name", "tool1"}, - {"arguments", { - {"arg", "value"} - }} + {"arguments", args} }} } })} @@ -328,7 +424,7 @@ caps caps_get(jinja::program & prog) { return; } - auto & tool_calls = messages->at(1)->at("tool_calls");; + auto & tool_calls = messages->at(1)->at("tool_calls"); caps_print_stats(tool_calls, "messages[1].tool_calls"); // check for second tool call usage diff --git a/common/jinja/caps.h b/common/jinja/caps.h index e694e7bfaa..93a7fe0926 100644 --- a/common/jinja/caps.h +++ b/common/jinja/caps.h @@ -18,6 +18,8 @@ struct caps { bool supports_string_content = true; bool supports_typed_content = false; + bool supports_object_arguments = false; + // for reporting on server std::map to_map() const; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b4ff8dd959..46469c8620 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -298,11 +298,16 @@ class ModelBase: scale = scale.float() if block_size is not None: + dim_offset = scale.ndim - len(block_size) for i, size in enumerate(block_size): - scale = scale.repeat_interleave(size, i) + scale = scale.repeat_interleave(size, dim_offset + i) # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) scale = scale[tuple(slice(0, size) for size in weight.shape)] + # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1]) + while scale.ndim < weight.ndim: + scale = scale.unsqueeze(-1) + return weight.float() * scale # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476 @@ -393,7 +398,7 @@ class ModelBase: elif quant_method == "fp8": block_size = quant_config.get("weight_block_size") for name in self.model_tensors.keys(): - if name.endswith(".weight_scale_inv"): + if name.endswith("_scale_inv"): weight_name = name.removesuffix("_scale_inv") w = self.model_tensors[weight_name] s = self.model_tensors[name] @@ -401,6 +406,8 @@ class ModelBase: tensors_to_remove.append(name) if name.endswith(".activation_scale"): # unused tensors_to_remove.append(name) + if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused + tensors_to_remove.append(name) # mistral format if name.endswith(".qscale_weight"): weight_name = name.removesuffix("qscale_weight") + "weight" @@ -3031,10 +3038,16 @@ class LlavaVisionModel(MmprojModel): def get_token_id(self, token: str) -> int: tokenizer_config_file = self.dir_model / 'tokenizer_config.json' with open(tokenizer_config_file, "r", encoding="utf-8") as f: - added_tokens_decoder = json.load(f)['added_tokens_decoder'] + added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {} for id_, token_data in added_tokens_decoder.items(): - if token_data["content"] == token: + if token_data.get("content") == token: return int(id_) + # fallthrough to tokenizer.json + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + for token_data in tokenizer_json["added_tokens"]: + if token_data["content"] == token: + return int(token_data["id"]) raise ValueError(f"Token '{token}' not found in tokenizer config.") def set_gguf_parameters(self): @@ -3198,40 +3211,6 @@ class Llama4VisionModel(MmprojModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register( - "Mistral3ForConditionalGeneration", - "Ministral3ForCausalLM", -) -class Mistral3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.MISTRAL3 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # for compatibility, we use LLAMA arch for older models - # TODO: remove this once everyone has migrated to newer version of llama.cpp - if self.hparams.get("model_type") != "ministral3": - self.model_arch = gguf.MODEL_ARCH.LLAMA - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] - self.gguf_writer.add_architecture() - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - rope_params = self.rope_parameters - if self.hparams.get("model_type") == "ministral3": - assert rope_params, "ministral3 must have 'rope_parameters' config" - assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" - self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - name = name.replace("language_model.", "") - if "multi_modal_projector" in name or "vision_tower" in name: - return - - yield from super().modify_tensors(data_torch, name, bid) - - @ModelBase.register("DeciLMForCausalLM") class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @@ -8271,6 +8250,8 @@ class DeepseekV2Model(TextModel): # TODO @ngxson : remove this when we support MTP for deepseek models skip_mtp = True + merge_expert = True + def set_vocab(self): try: self._set_vocab_gpt2() @@ -8409,7 +8390,7 @@ class DeepseekV2Model(TextModel): return # process the experts separately - if name.find("mlp.experts") != -1: + if self.merge_expert and name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -8468,6 +8449,69 @@ class DeepseekV2Model(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register( + "Mistral3ForConditionalGeneration", + "Ministral3ForCausalLM", +) +class Mistral3Model(TextModel): + class Ministral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.MISTRAL3 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + rope_params = self.rope_parameters + if self.hparams.get("model_type") == "ministral3": + assert rope_params, "ministral3 must have 'rope_parameters' config" + assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" + self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return + + yield from super().modify_tensors(data_torch, name, bid) + + class Mistral4Model(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.MISTRAL4 + skip_mtp = False # model contains no MTP layers, so no need to skip + merge_expert = False # experts are already stacked as 3D + + def modify_tensors(self, data_torch, name, bid): + if name.endswith(".down_proj") or name.endswith(".gate_up_proj"): + name = name + ".weight" + yield from super().modify_tensors(data_torch, name, bid) + + model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused + impl: TextModel + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams.get("model_type") == "mistral4": + self.impl = Mistral3Model.Mistral4Model(*args, **kwargs) + else: + self.impl = Mistral3Model.Ministral3Model(*args, **kwargs) + + def set_vocab(self): + self.impl.set_vocab() + + def set_gguf_parameters(self): + self.impl.set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + yield from self.impl.modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + self.impl.prepare_tensors() + + def write_vocab(self): + self.impl.write_vocab() + + def write(self): + self.impl.write() + + @ModelBase.register("MiniMaxM2ForCausalLM") class MiniMaxM2Model(TextModel): model_arch = gguf.MODEL_ARCH.MINIMAXM2 diff --git a/docs/ops.md b/docs/ops.md index 00bdcd5d2b..1357771442 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -117,5 +117,5 @@ Legend: | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | | TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ | +| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ | | XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 13ac447cc1..afcb7e4b8e 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -5937,6 +5937,20 @@ "SYCL0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","1","yes","SYCL" "SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","SYCL" "SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","SYCL" +"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000","support","1","yes","SYCL" +"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","SYCL" +"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000","support","1","yes","SYCL" +"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","SYCL" +"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=10.000000","support","1","yes","SYCL" +"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","SYCL" +"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","SYCL" +"SYCL0","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000","support","1","yes","SYCL" +"SYCL0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","SYCL" +"SYCL0","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000","support","1","yes","SYCL" +"SYCL0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","SYCL" +"SYCL0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=10.000000","support","1","yes","SYCL" +"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","SYCL" +"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","SYCL" "SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","SYCL" "SYCL0","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","SYCL" "SYCL0","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","SYCL" @@ -10209,24 +10223,24 @@ "SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","SYCL" "SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","1","yes","SYCL" "SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","1","yes","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","SYCL" -"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","1","yes","SYCL" +"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","1","yes","SYCL" "SYCL0","SUM","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL" "SYCL0","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","SUM","type=f32,ne=[11,5,6,3],permute=[0,3,2,1]","support","0","no","SYCL" @@ -13325,6 +13339,262 @@ "SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL" diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index c1856201b3..82b048bb3a 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -666,7 +666,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; -#if defined __ARM_NEON +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) const int8x16_t values = vld1q_s8(kvalues_mxfp4); const uint8x16_t m4b = vdupq_n_u8(0x0f); float32x4_t acc = vdupq_n_f32(0.0f); diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index 826055dd9a..d7e9ba4634 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -115,10 +115,10 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); - block_q8_K * y_blocks = (block_q8_K *)y; size_t nb = k / QK_K; #if defined(__riscv_v_intrinsic) + block_q8_K * y_blocks = (block_q8_K *)y; const size_t vlmax_f32m8 = __riscv_vsetvlmax_e32m8(); for (size_t i = 0; i < nb; i++) { @@ -2052,6 +2052,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -2147,6 +2148,7 @@ static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t *s = sumf; } +#endif void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2163,6 +2165,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -2269,6 +2272,7 @@ static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t *s = sumf; } +#endif void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2285,6 +2289,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +#if defined __riscv_v_intrinsic static const uint8_t sign_gather_indices_arr[64] = { 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7 @@ -2488,6 +2493,7 @@ static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t } *s = 0.125f * sumf; } +#endif void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2507,7 +2513,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } -#if defined(__riscv_v) +#if defined(__riscv_v_intrinsic) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, @@ -2542,7 +2548,6 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; -#endif static void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -2618,6 +2623,7 @@ static void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_ } *s = 0.125f * sumf; } +#endif void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2634,6 +2640,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -2818,6 +2825,7 @@ static void ggml_vec_dot_iq2_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size } *s = 0.125f * sumf; } +#endif void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2830,10 +2838,11 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const break; } #else - ggml_vec_dot_iq2_xxs_q8_K(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); UNUSED(nrc); @@ -2928,6 +2937,7 @@ static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t } *s = sumf; } +#endif void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -2944,6 +2954,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -3036,6 +3047,7 @@ static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size } *s = 0.25f * sumf; } +#endif void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3052,6 +3064,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -3161,6 +3174,7 @@ static void ggml_vec_dot_iq4_nl_q8_0_vl256(int n, float * GGML_RESTRICT s, size_ *s = sumf; } +#endif void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3177,6 +3191,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -3190,7 +3205,6 @@ static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_ const int nb = n / QK_K; -#if defined __riscv_v_intrinsic const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16); float sumf = 0; int acc[4]; @@ -3252,14 +3266,8 @@ static void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_ } *s = sumf; - -#else - UNUSED(x); - UNUSED(y); - UNUSED(nb); - ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); -#endif } +#endif void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3276,6 +3284,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -3381,6 +3390,7 @@ static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t *s = sumf; } +#endif void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3397,6 +3407,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -3467,6 +3478,7 @@ static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t *s = sumf; } +#endif void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3483,6 +3495,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +#if defined __riscv_v_intrinsic static void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -3592,6 +3605,7 @@ static void ggml_vec_dot_mxfp4_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t *s = sumf; } +#endif void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { #if defined __riscv_v_intrinsic @@ -3604,6 +3618,6 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo break; } #else - return ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp index cd5807879e..c37488cae5 100644 --- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp +++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp @@ -107,8 +107,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR } #else UNUSED(nb); - UNUSED(y); - ggml_quantize_mat_q8_0_4x4_generic(x, vy, k); + ggml_quantize_mat_q8_0_4x8_generic(x, vy, k); #endif } @@ -203,6 +202,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc); } +#if defined __riscv_zvfh void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -222,7 +222,6 @@ void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb); @@ -256,9 +255,6 @@ void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + x * 16, sumf, 16); } - return; -#endif - ggml_gemv_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -280,7 +276,6 @@ void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic const block_q8_K * a_ptr = (const block_q8_K *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -392,9 +387,6 @@ void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + x * 16, sumf, 16); } - return; -#endif - ggml_gemv_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -416,7 +408,6 @@ void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16); const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -451,9 +442,6 @@ void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const __riscv_vse32_v_f32m2(s + x * 16, sumf, 16); } - return; -#endif - ggml_gemv_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -476,7 +464,6 @@ void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(blocklen); UNUSED(bs); -#if defined __riscv_v_intrinsic const block_q8_0 * a_ptr = (const block_q8_0 *) vy; for (int x = 0; x < nc / ncols_interleaved; x++) { const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb); @@ -505,9 +492,6 @@ void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + x * 16, sumf, 16); } - return; -#endif - ggml_gemv_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -679,9 +663,9 @@ void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } // End K-Block __riscv_vse32_v_f32m2(s + col_tile, v_sumf, vl); - } } +#endif void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; @@ -909,6 +893,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc); } +#if defined __riscv_zvfh void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -929,7 +914,6 @@ void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -994,9 +978,6 @@ void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16); } } - return; -#endif - ggml_gemm_q4_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1019,7 +1000,6 @@ void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic for (int y = 0; y < nr / 4; y++) { const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -1267,9 +1247,6 @@ void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16); } } - return; -#endif - ggml_gemm_q4_K_16x1_q8_K_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1292,7 +1269,6 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16); for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); @@ -1355,9 +1331,6 @@ void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16); } } - return; -#endif - ggml_gemm_iq4_nl_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1380,7 +1353,6 @@ void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined __riscv_v_intrinsic for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); for (int x = 0; x < nc / ncols_interleaved; x++) { @@ -1429,9 +1401,6 @@ void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + (y * 4 + 3) * bs + x * 16, sumf_3, 16); } } - return; -#endif - ggml_gemm_q8_0_16x1_q8_0_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1731,3 +1700,4 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } } } +#endif diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 9bcc18d442..0ecf7ae02a 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -1461,7 +1461,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { return false; } if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) && - ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) { + ggml_ne(op->src[1], 3) == 1) { return true; } } @@ -1473,10 +1473,12 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } else { + if (op->src[0]->type != GGML_TYPE_F16) { + return nullptr; + } std::array kernel_chain; const int slot_total = kleidiai_collect_kernel_chain(op, kernel_chain); - const bool has_kernel = slot_total > 0; - if (has_kernel && op->src[1]->ne[1] > 1) { + if (slot_total > 0 && op->src[1]->ne[1] > 1) { if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) || (op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) { return nullptr; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 314cc1088a..3f85e531da 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6205,7 +6205,7 @@ static void ggml_compute_forward_im2col_f16( const ggml_tensor * src1 = dst->src[1]; GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS; @@ -6236,7 +6236,7 @@ static void ggml_compute_forward_im2col_f16( int ofs1 = is_2D ? nb12 : nb11; GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { @@ -6249,7 +6249,12 @@ static void ggml_compute_forward_im2col_f16( // micro kernel ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + const float * const src_data_f32 = src1->type == GGML_TYPE_F32 + ? (const float *)((const char *) src1->data + in*ofs0 + iic*ofs1) + : nullptr; // [IH, IW] + const ggml_fp16_t * const src_data_f16 = src1->type == GGML_TYPE_F16 + ? (const ggml_fp16_t *)((const char *) src1->data + in*ofs0 + iic*ofs1) + : nullptr; // [IH, IW] for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 for (int64_t ikw = 0; ikw < KW; ikw++) { @@ -6259,7 +6264,11 @@ static void ggml_compute_forward_im2col_f16( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]); + if (src_data_f32 != nullptr) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data_f32[iih*IW + iiw]); + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = src_data_f16[iih*IW + iiw]; + } } } } diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 6b76ab3bfb..f18758f16b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1365,6 +1365,7 @@ void ggml_gemv_q8_0_4x8_q8_0_generic(int n, } } +// Only enable these for RISC-V. #if defined __riscv_zvfh void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; @@ -1568,6 +1569,7 @@ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, assert(nc % 16 == 0); UNUSED(bs); + UNUSED(nr); const int nb = n / QK_K; const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx; @@ -2381,6 +2383,7 @@ void ggml_gemm_q8_0_4x8_q8_0_generic(int n, } } +// Only enable these for RISC-V. #if defined __riscv_zvfh void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index b30b7f2beb..a526d8e58b 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -24,6 +24,7 @@ #include "dmmv.hpp" #include "element_wise.hpp" #include "fattn.hpp" +#include "gated_delta_net.hpp" #include "gla.hpp" #include "im2col.hpp" #include "mmq.hpp" @@ -31,6 +32,7 @@ #include "norm.hpp" #include "outprod.hpp" #include "pad.hpp" +#include "pad_reflect_1d.hpp" #include "quantize.hpp" #include "quants.hpp" #include "roll.hpp" @@ -39,8 +41,8 @@ #include "ssm_conv.hpp" #include "softmax.hpp" #include "tsembd.hpp" +#include "upscale.hpp" #include "wkv.hpp" -#include "pad_reflect_1d.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index acd51bf45b..ec0247528c 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -294,30 +294,6 @@ static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl: } } -template -static void upscale(const T *x, T *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { - int index = item_ct1.get_local_id(0) + - item_ct1.get_group(0) * item_ct1.get_local_range(0); - if (index >= ne10 * ne11 * ne12 * ne13) { - return; - } - // operation - int i10 = index % ne10; - int i11 = (index / ne10) % ne11; - int i12 = (index / (ne10 * ne11)) % ne12; - int i13 = (index / (ne10 * ne11 * ne12)) % ne13; - - int i00 = static_cast(i10 / sf0); - int i01 = static_cast(i11 / sf1); - int i02 = static_cast(i12 / sf2); - int i03 = static_cast(i13 / sf3); - - dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); -} - template static void clamp(const T * x, T * dst, const float min, const float max, const int k, const sycl::nd_item<1> &item_ct1) { @@ -392,20 +368,6 @@ static void arange_kernel(T * dst, const int k, T start, T step, } } -template -static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, queue_ptr stream) { - int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE); - sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); - }); -} - template static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); @@ -505,42 +467,6 @@ static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & c } } -template -static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - - GGML_ASSERT(dst->src[0]->type == dst->type); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0]; - const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1]; - const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; - const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; - switch (dst->type) { - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], - (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, - main_stream, std::forward(args)...); - break; - } - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], - (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, - main_stream, std::forward(args)...); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - template static inline void ggml_sycl_op_unary( ggml_backend_sycl_context & ctx, ggml_tensor * dst, F func) { @@ -784,15 +710,6 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor }); } -static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst, - [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03, - int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3, - queue_ptr stream) { - ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream); - }); -} - static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float min_val; float max_val; @@ -1131,12 +1048,6 @@ void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_sqr(ctx, dst); } -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_upscale(ctx, dst); -} - - void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_clamp(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 7c71974687..997132166a 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -71,8 +71,6 @@ void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1281970584..2ec1421841 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -44,7 +44,6 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/common.hpp" #include "ggml-sycl/element_wise.hpp" -#include "ggml-sycl/gated_delta_net.hpp" #include "ggml-sycl/gemm.hpp" #include "ggml-sycl/getrows.hpp" #include "ggml-sycl/norm.hpp" @@ -4863,9 +4862,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: case GGML_OP_IM2COL: - return true; case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); + return true; case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: diff --git a/ggml/src/ggml-sycl/upscale.cpp b/ggml/src/ggml-sycl/upscale.cpp new file mode 100644 index 0000000000..18c743de44 --- /dev/null +++ b/ggml/src/ggml-sycl/upscale.cpp @@ -0,0 +1,410 @@ +#include "upscale.hpp" + +static void upscale_f32(const float * x, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int ne13, + const float sf0, const float sf1, const float sf2, const float sf3) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + int index = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (index >= ne10 * ne11 * ne12 * ne13) { + return; + } + + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = (index / (ne10 * ne11 * ne12)) % ne13; + + int i00 = i10 / sf0; + int i01 = i11 / sf1; + int i02 = i12 / sf2; + int i03 = i13 / sf3; + + dst[index] = *((const float*)((const char*)x + i03 * nb03 + i02 * nb02 + + i01 * nb01 + i00 * nb00)); +} + +static void upscale_f32_bilinear(const float * x, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const int64_t index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; + int y0_src = (int) sycl::floor((float) y_src_f); + int y1_src = y0_src + 1; + + y0_src = sycl::max(0, sycl::min(y0_src, ne01_src - 1)); + y1_src = sycl::max(0, sycl::min(y1_src, ne01_src - 1)); + + float dy = y_src_f - (float)y0_src; + dy = sycl::max(0.0f, sycl::min(dy, 1.0f)); + + float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; + int x0_src = (int) sycl::floor(x_src_f); + int x1_src = x0_src + 1; + + x0_src = sycl::max(0, sycl::min(x0_src, ne00_src - 1)); + x1_src = sycl::max(0, sycl::min(x1_src, ne00_src - 1)); + + float dx = x_src_f - (float)x0_src; + dx = sycl::max(0.0f, sycl::min(dx, 1.0f)); + + const float* p_a = + (const float*)((const char*)x + (int64_t)x0_src * nb00 + + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + + (int64_t)i03_src * nb03); + const float* p_b = + (const float*)((const char*)x + (int64_t)x1_src * nb00 + + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + + (int64_t)i03_src * nb03); + const float* p_c = + (const float*)((const char*)x + (int64_t)x0_src * nb00 + + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + + (int64_t)i03_src * nb03); + const float* p_d = + (const float*)((const char*)x + (int64_t)x1_src * nb00 + + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + + (int64_t)i03_src * nb03); + + const float val_a = *p_a; + const float val_b = *p_b; + const float val_c = *p_c; + const float val_d = *p_d; + + float result = val_a * (1.0f - dx) * (1.0f - dy) + + val_b * dx * (1.0f - dy) + + val_c * (1.0f - dx) * dy + + val_d * dx * dy; + + dst[index] = result; +} + +// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) +// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp +static void upscale_f32_bilinear_antialias(const float * src0, + float * dst, + const int nb00, + const int nb01, + const int nb02, + const int nb03, + const int ne00_src, + const int ne01_src, + const int ne10_dst, + const int ne11_dst, + const int ne12_dst, + const int ne13_dst, + const float sf0, + const float sf1, + const float sf2, + const float sf3, + const float pixel_offset) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const int64_t index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y = ((float)i11_dst + pixel_offset) / sf1; + const float x = ((float)i10_dst + pixel_offset) / sf0; + + // support and invscale, minimum 1 pixel for bilinear + const float support1 = sycl::max(1.0f / sf1, 1.0f); + const float invscale1 = 1.0f / support1; + const float support0 = sycl::max(1.0f / sf0, 1.0f); + const float invscale0 = 1.0f / support0; + + // the range of source pixels that contribute + const int64_t x_min = sycl::max(int64_t(0), int64_t(x - support0 + pixel_offset)); + const int64_t x_max = sycl::min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset)); + const int64_t y_min = sycl::max(int64_t(0), int64_t(y - support1 + pixel_offset)); + const int64_t y_max = sycl::min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset)); + + // bilinear filter with antialiasing + float val = 0.0f; + float total_weight = 0.0f; + + auto triangle_filter = [](float x) -> float { + return sycl::max(1.0f - sycl::fabs(x), 0.0f); + }; + + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0); + const float weight = weight_x * weight_y; + + if (weight <= 0.0f) { + continue; + } + + const float pixel = + *(const float*)((const char*)src0 + sx * nb00 + sy * nb01 + + i02_src * nb02 + i03_src * nb03); + val += pixel * weight; + total_weight += weight; + } + } + + if (total_weight > 0.0f) { + val /= total_weight; + } + + dst[index] = val; +} + +namespace bicubic_interpolation { +static float weight1(float x, const float &a) { return ((a + 2) * x - (a + 3)) * x * x + 1; }; +static float weight2(float x, const float &a) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; }; + +static float bicubic(float p0, float p1, float p2, float p3, float x, float a) { + const float w0 = weight2(x + 1, a); + const float w1 = weight1(x + 0, a); + const float w2 = weight1(1 - x, a); + const float w3 = weight2(2 - x, a); + return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3; +}; + +} + +static void upscale_f32_bicubic(const float * x, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const float a = -0.75f; + using bicubic_interpolation::bicubic; + + const int64_t index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + const int64_t dst_total_elements = + ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; + const int y0_src = (int) sycl::floor((float) y_src_f); + const float dy = y_src_f - (float)y0_src; + + const float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; + const int x0_src = (int) sycl::floor((float) x_src_f); + const float dx = x_src_f - (float)x0_src; + + const char * x_base = (const char *)x + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03; + + auto load = [=](int x_off, int y_off) -> float { + int i00_src = sycl::max(0, sycl::min(x0_src + x_off, ne00_src - 1)); + int i01_src = sycl::max(0, sycl::min(y0_src + y_off, ne01_src - 1)); + return *(const float *)(x_base + (int64_t)i00_src * nb00 + (int64_t)i01_src * nb01); + }; + + const float result = bicubic( + bicubic(load(-1, -1), load(0, -1), load(1, -1), load(2, -1), dx, a), + bicubic(load(-1, 0), load(0, 0), load(1, 0), load(2, 0), dx, a), + bicubic(load(-1, 1), load(0, 1), load(1, 1), load(2, 1), dx, a), + bicubic(load(-1, 2), load(0, 2), load(1, 2), load(2, 2), dx, a), + dy, + a); + + dst[index] = result; +} + +static void upscale_f32_sycl(const float * x, + float * dst, + const int nb00, + const int nb01, + const int nb02, + const int nb03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const float sf0, + const float sf1, + const float sf2, + const float sf3, + dpct::queue_ptr stream) { + const int64_t dst_size = ne10 * ne11 * ne12 * ne13; + const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3); + }); +} + +static void upscale_f32_bilinear_sycl(const float * x, + float * dst, + const int nb00, + const int nb01, + const int nb02, + const int nb03, + const int ne00_src, + const int ne01_src, + const int ne10_dst, + const int ne11_dst, + const int ne12_dst, + const int ne13_dst, + const float sf0, + const float sf1, + const float sf2, + const float sf3, + const float pixel_offset, + bool antialias, + dpct::queue_ptr stream) { + const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + + if (antialias) { + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32_bilinear_antialias( + x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, + ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32_bilinear( + x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, + ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + }); + } +} + +static void upscale_f32_bicubic_sycl(const float * x, + float * dst, + const int nb00, + const int nb01, + const int nb02, + const int nb03, + const int ne00_src, + const int ne01_src, + const int ne10_dst, + const int ne11_dst, + const int ne12_dst, + const int ne13_dst, + const float sf0, + const float sf1, + const float sf2, + const float sf3, + const float pixel_offset, + dpct::queue_ptr stream) { + const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + + { + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32_bicubic( + x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, + ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + }); + }); + } +} + +void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int mode_flags = dst->op_params[0]; + const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF); + + float sf0 = (float)dst->ne[0]/src0->ne[0]; + float sf1 = (float)dst->ne[1]/src0->ne[1]; + float sf2 = (float)dst->ne[2]/src0->ne[2]; + const float sf3 = (float)dst->ne[3]/src0->ne[3]; + + float pixel_offset = 0.5f; + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 + ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) + : sf0; + sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 + ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) + : sf1; + pixel_offset = 0.0f; + } + + if (mode == GGML_SCALE_MODE_NEAREST) { + upscale_f32_sycl( + src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); + upscale_f32_bilinear_sycl( + src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + sf0, sf1, sf2, sf3, pixel_offset, antialias, stream); + } else if (mode == GGML_SCALE_MODE_BICUBIC) { + upscale_f32_bicubic_sycl( + src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + sf0, sf1, sf2, sf3, pixel_offset, stream); + } +} + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_upscale(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/upscale.hpp b/ggml/src/ggml-sycl/upscale.hpp new file mode 100644 index 0000000000..c36c1bdc97 --- /dev/null +++ b/ggml/src/ggml-sycl/upscale.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include +#include "dpct/helper.hpp" +#include "common.hpp" + +#define SYCL_UPSCALE_BLOCK_SIZE 256 + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 7092361d2e..3d8ce10676 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -191,6 +191,7 @@ struct vk_queue; struct vk_command_buffer { vk::CommandBuffer buf; + uint64_t use_counter = 0; bool in_use = false; }; @@ -938,21 +939,26 @@ struct vk_subbuffer { } }; -// vk_event is used for the event-related backend interfaces. It uses 'event' for -// event_wait and 'fence' for event_synchronize. Polling on an event for -// event_synchronize wouldn't be sufficient to wait for command buffers to complete, -// and would lead to validation errors. -struct vk_event { - vk::Event event; - vk::Fence fence; - vk_command_buffer* cmd_buffer = nullptr; -}; - struct vk_semaphore { vk::Semaphore s; uint64_t value; }; +// vk_event is used for the event-related backend interfaces. It uses vk::Events for +// event_wait and a timeline semaphore for event_synchronize. Polling on an event for +// event_synchronize wouldn't be sufficient to wait for command buffers to complete, +// and would lead to validation errors. +struct vk_event { + std::vector events_free; // Events available for reuse + std::vector events_submitted; // Events that are fully submitted and can be reused on next synchronize + vk::Event event; + bool has_event; + + vk_semaphore tl_semaphore; + vk_command_buffer* cmd_buffer = nullptr; + uint64_t cmd_buffer_use_counter = 0; +}; + struct vk_submission { vk_command_buffer* buffer = nullptr; std::vector wait_semaphores; @@ -2319,7 +2325,7 @@ static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_comman vk::CommandBufferLevel::ePrimary, 1); const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); - p.cmd_buffers.push_back({ cmd_buffers.front(), true }); + p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true }); return &p.cmd_buffers[p.cmd_buffers.size()-1]; } @@ -2788,6 +2794,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct ); } +static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) { + VK_LOG_DEBUG("ggml_vk_set_event()"); + + ctx->s->buffer->buf.resetEvent( + event, + ctx->p->q->stage_flags + ); +} + static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) { VK_LOG_DEBUG("ggml_vk_set_event()"); @@ -4981,8 +4996,9 @@ static vk_device ggml_vk_get_device(size_t idx) { std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues - // On AMD, the graphics queue seems to be faster, so don't avoid it - const vk::QueueFlagBits graphics_flag = device->vendor_id == VK_VENDOR_ID_AMD ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; + // Allow overriding avoiding the graphics queue because it can increase performance on RADV + const bool allow_graphics_queue = (getenv("GGML_VK_ALLOW_GRAPHICS_QUEUE") != nullptr); + const vk::QueueFlagBits graphics_flag = allow_graphics_queue ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1); const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1); @@ -5443,11 +5459,14 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_load_shaders(device); + // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled + const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue; + if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); - device->async_use_transfer_queue = (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); + device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); } else { // TODO: Use pointer or reference to avoid copy device->transfer_queue.copyFrom(device->compute_queue); @@ -6392,6 +6411,7 @@ static vk_subbuffer ggml_vk_tensor_subbuffer( static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) { for (auto& cmd_buffer : pool.cmd_buffers) { if (!cmd_buffer.in_use) { + cmd_buffer.use_counter++; cmd_buffer.in_use = true; return &cmd_buffer; } @@ -6496,15 +6516,16 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { } static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) { + vk_context result; if (!ctx->compute_ctx.expired()) { - return ctx->compute_ctx.lock(); + result = ctx->compute_ctx.lock(); + } else { + result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + + ctx->compute_ctx = result; + ggml_vk_ctx_begin(ctx->device, result); } - vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - - ctx->compute_ctx = result; - ggml_vk_ctx_begin(ctx->device, result); - if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { result->s->wait_semaphores.push_back(ctx->transfer_semaphore); ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; @@ -13797,6 +13818,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { ctx->submit_pending = false; if (cmd_buf) { cmd_buf->in_use = false; + cmd_buf->buf.reset(); } } @@ -14858,18 +14880,31 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset - // the backend interface doesn't have an explicit reset, so reset it here - // before we record the command to set it - ctx->device->device.resetEvent(vkev->event); - ctx->device->device.resetFences({ vkev->fence }); + if (vkev->has_event) { + // Move existing event into submitted + vkev->events_submitted.push_back(vkev->event); + } + + // Grab the next event and record it, create one if necessary + if (vkev->events_free.empty()) { + vkev->event = ctx->device->device.createEvent({}); + } else { + vkev->event = vkev->events_free.back(); + vkev->events_free.pop_back(); + } + + vkev->has_event = true; ggml_vk_set_event(compute_ctx, vkev->event); + vkev->tl_semaphore.value++; + compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore); ggml_vk_ctx_end(compute_ctx); - ggml_vk_submit(compute_ctx, {vkev->fence}); + ggml_vk_submit(compute_ctx, {}); ctx->submit_pending = true; vkev->cmd_buffer = cmd_buf; + vkev->cmd_buffer_use_counter = cmd_buf->use_counter; ctx->compute_ctx.reset(); } @@ -14880,9 +14915,10 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - ggml_vk_wait_events(compute_ctx, {vkev->event}); - ggml_vk_ctx_end(compute_ctx); - ctx->compute_ctx.reset(); + if (vkev->has_event) { + // Wait for latest event + ggml_vk_wait_events(compute_ctx, { vkev->event }); + } } // TODO: enable async and synchronize @@ -15672,10 +15708,13 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t return nullptr; } - // The event/fence is expected to initially be in the signaled state. - vkev->event = device->device.createEvent({}); - vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled}); - device->device.setEvent(vkev->event); + // No events initially, they get created on demand + vkev->has_event = false; + + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 }; return new ggml_backend_event { /* .device = */ dev, @@ -15689,8 +15728,16 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe vk_event *vkev = (vk_event *)event->context; - device->device.destroyFence(vkev->fence); - device->device.destroyEvent(vkev->event); + device->device.destroySemaphore(vkev->tl_semaphore.s); + for (auto& event : vkev->events_free) { + device->device.destroyEvent(event); + } + for (auto& event : vkev->events_submitted) { + device->device.destroyEvent(event); + } + if (vkev->has_event) { + device->device.destroyEvent(vkev->event); + } delete vkev; delete event; } @@ -15701,10 +15748,29 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm auto device = ggml_vk_get_device(ctx->device); vk_event *vkev = (vk_event *)event->context; - VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize"); - // Finished using current command buffer so we flag for reuse - if (vkev->cmd_buffer) { - vkev->cmd_buffer->in_use = false; + // Only do something if the event has actually been used + if (vkev->has_event) { + vk::Semaphore sem = vkev->tl_semaphore.s; + uint64_t val = vkev->tl_semaphore.value; + vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val}; + VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize"); + + // Reset and move submitted events + for (auto& event : vkev->events_submitted) { + device->device.resetEvent(event); + } + vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end()); + vkev->events_submitted.clear(); + + // Finished using current command buffer so we flag for reuse + if (vkev->cmd_buffer) { + // Only flag for reuse if it hasn't been reused already + if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) { + vkev->cmd_buffer->in_use = false; + vkev->cmd_buffer->buf.reset(); + } + vkev->cmd_buffer = nullptr; + } } } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bf617382d0..0a032e9039 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -478,6 +478,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + MISTRAL4 = auto() PADDLEOCR = auto() MIMO2 = auto() STEP35 = auto() @@ -924,6 +925,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.MISTRAL4: "mistral4", MODEL_ARCH.PADDLEOCR: "paddleocr", MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.STEP35: "step35", @@ -3538,6 +3540,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.MISTRAL4: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], MODEL_ARCH.MIMO2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 02a096882e..96bf67d5f9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -d6754f3d0e6d0acd21c12442353c9fd2f94188e7 +553552e1d88be2b214b85e5159eedd39a63e2c34 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 799d16167b..84dc6d8f1b 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -123,6 +123,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, "mimo2" }, { LLM_ARCH_STEP35, "step35" }, @@ -1589,6 +1590,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_UP_SHEXP, }; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_MISTRAL4: return { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, diff --git a/src/llama-arch.h b/src/llama-arch.h index b1b1dcf188..9b9eec2f5c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -127,6 +127,7 @@ enum llm_arch { LLM_ARCH_RND1, LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, + LLM_ARCH_MISTRAL4, LLM_ARCH_PADDLEOCR, LLM_ARCH_MIMO2, LLM_ARCH_STEP35, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bae02e32b1..85db938a7a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1587,6 +1587,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_MISTRAL4: { // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); @@ -4883,6 +4884,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_MISTRAL4: { const bool is_mla = hparams.is_mla(); @@ -7850,7 +7852,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); } - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA) { + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -8428,6 +8430,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_GLM_DSA: + case LLM_ARCH_MISTRAL4: { llm = std::make_unique(*this, params); } break; @@ -8839,6 +8842,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_MISTRAL4: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: case LLM_ARCH_GLM_DSA: diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 3108bf331a..d07579ee87 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -224,7 +224,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( beta = ggml_sigmoid(ctx0, beta); ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); - alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); + alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); cb(alpha, "alpha", il); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 165e2412e5..b38660c0bc 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -224,7 +224,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( beta = ggml_sigmoid(ctx0, beta); ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s); - alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); + alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); cb(alpha, "alpha", il); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index 05ea8ca9e9..23af1ec827 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -1915,7 +1915,7 @@ env.globals["raise_exception"] = raise_exception template = env.from_string(tmpl) result = template.render(**vars_json) -print(result, end='') +sys.stdout.buffer.write(result.encode()) )"; static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 014b3f2b14..d51c09e99f 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -90,7 +90,10 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_embd = 64; n_head = 1; n_ff = 96; - } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) { + } else if (arch == LLM_ARCH_DEEPSEEK2 + || arch == LLM_ARCH_GLM_DSA + || arch == LLM_ARCH_KIMI_LINEAR + || arch == LLM_ARCH_MISTRAL4) { n_embd = 128; n_head = 1; n_ff = 192; @@ -145,7 +148,10 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { } ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f); - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) { + if (arch == LLM_ARCH_DEEPSEEK2 + || arch == LLM_ARCH_GLM_DSA + || arch == LLM_ARCH_KIMI_LINEAR + || arch == LLM_ARCH_MISTRAL4) { ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH, uint32_t(576)); ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, uint32_t(512)); ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, uint32_t(64)); @@ -319,6 +325,7 @@ static bool moe_mandatory(const llm_arch arch) { case LLM_ARCH_MIMO2: case LLM_ARCH_KIMI_LINEAR: case LLM_ARCH_STEP35: + case LLM_ARCH_MISTRAL4: return true; default: return false; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 7c4342d6bf..94182b2ad8 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -215,6 +215,7 @@ struct cli_context { inputs.parallel_tool_calls = false; inputs.add_generation_prompt = true; inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + inputs.force_pure_content = chat_params.force_pure_content; inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false; // Apply chat template to the list of messages diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 2e0f087184..58d598fcc0 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -308,6 +308,7 @@ int main(int argc, char ** argv) { inputs.use_jinja = g_params->use_jinja; inputs.messages = chat_msgs; inputs.add_generation_prompt = !params.prompt.empty(); + inputs.force_pure_content = params.force_pure_content_parser; prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; } diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 9cff8a0126..6dcd04cbf3 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index bd203228cc..59ea11fc47 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1065,6 +1065,7 @@ json oaicompat_chat_params_parse( inputs.add_generation_prompt = true; } + inputs.force_pure_content = opt.force_pure_content; // Apply chat template to the list of messages auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs); @@ -1273,17 +1274,27 @@ json convert_responses_to_chatcmpl(const json & response_body) { for (const auto & output_text : item.at("content")) { const std::string type = json_value(output_text, "type", std::string()); - if (type != "output_text") { - throw std::invalid_argument("'type' must be 'output_text'"); + if (type == "output_text") { + if (!exists_and_is_string(output_text, "text")) { + throw std::invalid_argument("'Output text' requires 'text'"); + // Ignore annotations and logprobs for now + chatcmpl_content.push_back({ + {"text", output_text.at("text")}, + {"type", "text"}, + }); + } + } else if (type == "refusal") { + if (!exists_and_is_string(output_text, "refusal")) { + throw std::invalid_argument("'Refusal' requires 'refusal'"); + // Ignore annotations and logprobs for now + chatcmpl_content.push_back({ + {"refusal", output_text.at("refusal")}, + {"type", "refusal"}, + }); + } + } else { + throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'"); } - if (!exists_and_is_string(output_text, "text")) { - throw std::invalid_argument("'Output text' requires 'text'"); - } - // Ignore annotations and logprobs for now - chatcmpl_content.push_back({ - {"text", output_text.at("text")}, - {"type", "text"}, - }); } if (merge_prev) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 3e56b3d856..213ae52bb0 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -290,6 +290,7 @@ struct server_chat_params { int reasoning_budget = -1; std::string reasoning_budget_message; std::string media_path; + bool force_pure_content = false; }; // used by /completions endpoint diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c47ad876cb..1e5ff101c8 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -911,6 +911,7 @@ private: /* reasoning_budget */ params_base.reasoning_budget, /* reasoning_budget_msg */ params_base.reasoning_budget_message, /* media_path */ params_base.media_path, + /* force_pure_content */ params_base.force_pure_content_parser }; } @@ -2402,11 +2403,11 @@ private: } { - // erase any checkpoints with pos_min > pos_min_thold + // erase any checkpoints with pos_max > pos_next for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { const auto & cur = *it; - if (cur.pos_min > pos_min_thold) { - SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, (float) cur.data.size() / 1024 / 1024); + if (cur.pos_max > pos_next) { + SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, pos_next = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, pos_next, (float) cur.data.size() / 1024 / 1024); it = slot.prompt.checkpoints.erase(it); } else { ++it; diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte index c676e224a7..1d2dd3c1d9 100644 --- a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte +++ b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte @@ -11,7 +11,7 @@ iconSize?: string; class?: string; disabled?: boolean; - onclick: () => void; + onclick: (e?: MouseEvent) => void; 'aria-label'?: string; } diff --git a/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte b/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte index eac83f234d..3a1db5c77d 100644 --- a/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte +++ b/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte @@ -5,21 +5,38 @@ import { serverStore } from '$lib/stores/server.svelte'; import { modelsStore, modelOptions, modelsLoading } from '$lib/stores/models.svelte'; import { formatFileSize, formatParameters, formatNumber } from '$lib/utils'; + import type { ApiLlamaCppServerProps } from '$lib/types'; interface Props { open?: boolean; onOpenChange?: (open: boolean) => void; + // when set, fetch props from the child process (router mode) + modelId?: string | null; } - let { open = $bindable(), onOpenChange }: Props = $props(); + let { open = $bindable(), onOpenChange, modelId = null }: Props = $props(); - let serverProps = $derived(serverStore.props); - let modelName = $derived(modelsStore.singleModelName); + let isRouter = $derived(serverStore.isRouterMode); + + // per-model props fetched from the child process + let routerModelProps = $state(null); + let isLoadingRouterProps = $state(false); + + // in router mode use per-model props, otherwise use global props + let serverProps = $derived(isRouter && modelId ? routerModelProps : serverStore.props); + + let modelName = $derived(isRouter && modelId ? modelId : modelsStore.singleModelName); let models = $derived(modelOptions()); let isLoadingModels = $derived(modelsLoading()); - // Get the first model for single-model mode display - let firstModel = $derived(models[0] ?? null); + // in router mode, find the model option matching modelId + // in single mode, use the first model as before + let firstModel = $derived.by(() => { + if (isRouter && modelId) { + return models.find((m) => m.model === modelId) ?? null; + } + return models[0] ?? null; + }); // Get modalities from modelStore using the model ID from the first model let modalities = $derived.by(() => { @@ -33,10 +50,31 @@ modelsStore.fetch(); } }); + + // fetch per-model props from child process when dialog opens in router mode + $effect(() => { + if (open && isRouter && modelId) { + isLoadingRouterProps = true; + modelsStore + .fetchModelProps(modelId) + .then((props) => { + routerModelProps = props; + }) + .catch(() => { + routerModelProps = null; + }) + .finally(() => { + isLoadingRouterProps = false; + }); + } + if (!open) { + routerModelProps = null; + } + }); - +