Merge branch 'master' into hksdpc255-patch-2

This commit is contained in: c9315a3c73
@@ -4,7 +4,7 @@
 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
 
 # ==============================================================================
 # BUILD STAGE
@@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 ENTRYPOINT [ "/app/llama-cli" ]
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light
 
 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin
 
 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
 
 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
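Note: the light images now ship `llama-completion` alongside `llama-cli`, but the ENTRYPOINT still points at `llama-cli`, so the legacy binary must be selected explicitly. A usage sketch (the image tag and model path are illustrative, not taken from this diff):

```bash
# default entrypoint runs the chat CLI
docker run -v /models:/models ghcr.io/ggml-org/llama.cpp:light \
    -m /models/7B/ggml-model-q4_0.gguf

# override the entrypoint to reach the legacy completion binary
docker run --entrypoint /app/llama-completion -v /models:/models \
    ghcr.io/ggml-org/llama.cpp:light \
    -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 64
```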
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
     exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model (chat) previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
+    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
     echo "              ex: -m model.gguf"
     echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
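The new subcommand maps directly onto the help text above; a usage sketch of the full image's dispatcher:

```bash
# chat mode (dispatches to llama-cli)
./tools.sh --run -m /models/7B/ggml-model-q4_0.bin

# legacy completion mode (dispatches to llama-completion), mirroring the old --run example
./tools.sh --run-legacy -m /models/7B/ggml-model-q4_0.bin -no-cnv \
    -p "Building a website can be done in 10 simple steps:" -n 512
```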
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
 
 WORKDIR /app
@@ -11,7 +11,7 @@ body:
       (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
       If you encountered the issue while using an external UI (e.g. ollama),
       please reproduce your issue using one of the examples/binaries in this repository.
-      The `llama-cli` binary can be used for simple and reproducible model inference.
+      The `llama-completion` binary can be used for simple and reproducible model inference.
   - type: textarea
     id: version
     attributes:
@@ -74,9 +74,12 @@ body:
         Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
+
+        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
       placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
+        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+        With short prompts or `-fa off` it works correctly.
         Here are the exact commands that I used: ...
     validations:
       required: true
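For reporters, the triage flow the template asks for looks roughly like this (model path and prompt are placeholders, not from this diff):

```bash
# 1. try to reproduce with parameter fitting disabled
llama-completion -m model.gguf -fit off -p "test prompt" -n 64

# 2. if it only reproduces with fitting enabled, capture both log variants
llama-completion -m model.gguf -fit on           -p "test prompt" -n 64 2> fit-on.log
llama-completion -m model.gguf -fit on --verbose -p "test prompt" -n 64 2> fit-on-verbose.log
```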
@@ -20,7 +20,8 @@ on:
       '**/*.swift',
       '**/*.m',
       '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
     ]
 
   pull_request:
@@ -40,7 +41,8 @@ on:
       '**/*.swift',
       '**/*.m',
       '**/*.metal',
-      '**/*.comp'
+      '**/*.comp',
+      '**/*.glsl'
     ]
 
 concurrency:
@@ -243,7 +245,7 @@ jobs:
           echo "Fetch llama2c model"
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
           ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
 
       - name: Test llama2c (s390x)
         id: llama2c_test_s390x
@@ -252,7 +254,7 @@ jobs:
           cd build
           echo "Fetch llama2c big-endian model"
           wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
 
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -1400,26 +1402,55 @@ jobs:
         chip_type: ['910b', '310p']
         build: ['Release']
     runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
-      - name: Dependencies
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+
+      - name: Set container image
+        id: cann-image
         run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"
 
       - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
         run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
           cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
               -DGGML_CANN=on \
-              -DSOC_TYPE=ascend${{ matrix.chip_type }}
+              -DSOC_TYPE=${SOC_TYPE}
           cmake --build build -j $(nproc)
+
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '
 
 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
   ggml-ci-x64-cpu-low-perf:
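The Build step above is self-contained, so the same containerized build can be reproduced locally; a sketch for the 910b/Release matrix entry (other entries are analogous), using only the commands already shown in the hunk:

```bash
# from a llama.cpp checkout, mirror the workflow's docker-run build
docker run --rm -v "${PWD}:/workspace" -w /workspace \
    -e SOC_TYPE=ascend910b -e BUILD_TYPE=Release \
    ascendai/cann:8.3.rc2-910b-openeuler24.03-py3.11 \
    bash -lc '
        set -e
        yum install -y git gcc gcc-c++ make cmake libcurl-devel
        export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
        cmake -S . -B build -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DGGML_CANN=on -DSOC_TYPE=${SOC_TYPE}
        cmake --build build -j $(nproc)
    '
```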
@@ -1770,7 +1801,7 @@ jobs:
           echo "Fetch llama2c model"
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
           ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
 
   ubuntu-cmake-sanitizer-riscv64-native:
     runs-on: RISCV64
@@ -731,6 +731,78 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
           name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
 
+  openEuler-cann:
+    strategy:
+      matrix:
+        arch: [x86, aarch64]
+        chip_type: ['910b', '310p']
+        build: ['Release']
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+
+      - name: Set container image
+        id: cann-image
+        run: |
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"
+
+      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
+        run: |
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                  -DGGML_CANN=on \
+                  -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
 
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -752,6 +824,7 @@ jobs:
       - macOS-arm64
       - macOS-x64
       - ios-xcode-build
+      - openEuler-cann
 
     steps:
       - name: Clone
@@ -844,6 +917,12 @@ jobs:
             - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
             - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
+
+            **openEuler:**
+            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
 
       - name: Upload release
         id: upload_release
         uses: actions/github-script@v3
@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp
 
 # Deprecated
@@ -87,7 +87,8 @@
 /tests/ @ggerganov
 /tests/test-chat-.* @pwilkin
 /tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
 /tools/quantize/ @ggerganov
@@ -15,6 +15,7 @@ The project differentiates between 3 levels of contributors:
 - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
 - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
README.md (18 changed lines)
@@ -313,7 +313,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 
 To learn more about model quantization, [read this documentation](tools/quantize/README.md)
 
-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)
 
 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@@ -347,19 +347,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
 
   </details>
 
-- <details>
-    <summary>Run simple text completion</summary>
-
-    To disable conversation mode explicitly, use `-no-cnv`
-
-    ```bash
-    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
-    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-    ```
-
-  </details>
-
 - <details>
     <summary>Constrain the output with a custom grammar</summary>
@@ -538,7 +525,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
 
 ## Other documentation
 
-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)
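The removed "Run simple text completion" README example now belongs to the completion tool; an equivalent invocation, assuming `llama-completion` keeps the same flags the old `llama-cli` example used (which matches its usage elsewhere in this commit):

```bash
llama-completion -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
```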
ci/run.sh (30 changed lines)
@@ -398,18 +398,20 @@ function gg_run_qwen3_0_6b {
     ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
     ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
 
-    (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
+    (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
     (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     if [ -z ${GG_BUILD_NO_BF16} ]; then
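To exercise these steps outside of CI, the script can be run directly (invocation per ci/README.md, assuming your checkout matches); the updated qwen3 step now also writes a ${ci}-fp-f16.log from llama-fit-params next to the -tg-*.log files:

```bash
mkdir -p tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```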
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
 
     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
 
     model_f16="${path_models}/ggml-model-f16.gguf"
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
|
@ -73,6 +73,8 @@ add_library(${TARGET} STATIC
|
||||||
ngram-cache.h
|
ngram-cache.h
|
||||||
peg-parser.cpp
|
peg-parser.cpp
|
||||||
peg-parser.h
|
peg-parser.h
|
||||||
|
preset.cpp
|
||||||
|
preset.h
|
||||||
regex-partial.cpp
|
regex-partial.cpp
|
||||||
regex-partial.h
|
regex-partial.h
|
||||||
sampling.cpp
|
sampling.cpp
|
||||||
|
|
|
||||||
common/arg.cpp (511 changed lines; diff suppressed because it is too large)

common/arg.h (45 changed lines)
@@ -3,8 +3,10 @@
 #include "common.h"
 
 #include <set>
+#include <map>
 #include <string>
 #include <vector>
+#include <cstring>
 
 //
 // CLI argument parsing
@@ -14,6 +16,7 @@ struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
     std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
+    std::vector<const char *> args_neg; // for negated args like --no-xxx
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
     const char * env          = nullptr;
@@ -23,6 +26,9 @@ struct common_arg {
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;
+
+    common_arg() = default;
 
     common_arg(
         const std::initializer_list<const char *> & args,
@@ -44,6 +50,13 @@ struct common_arg {
         void (*handler)(common_params & params)
     ) : args(args), help(help), handler_void(handler) {}
 
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
     // support 2 values for arg
     common_arg(
         const std::initializer_list<const char *> & args,
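The new `handler_bool` constructor takes a positive and a negated spelling for the same option; together with the `common_arg_utils::is_truthy`/`is_falsey`/`is_autoy` helpers declared in the next hunk, this is what lets flags accept on/off-style values. Illustrative invocations, borrowing the flag spellings used by the bug-template wording elsewhere in this commit:

```bash
llama-completion -m model.gguf -fa on    # boolean-style arg parsed as truthy
llama-completion -m model.gguf -fit off  # boolean-style arg parsed as falsey
```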
@@ -61,9 +74,33 @@ struct common_arg {
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
     bool has_value_from_env() const;
-    std::string to_string();
+    std::string to_string() const;
+
+    // for using as key in std::map
+    bool operator<(const common_arg & other) const {
+        if (args.empty() || other.args.empty()) {
+            return false;
+        }
+        return strcmp(args[0], other.args[0]) < 0;
+    }
+    bool operator==(const common_arg & other) const {
+        if (args.empty() || other.args.empty()) {
+            return false;
+        }
+        return strcmp(args[0], other.args[0]) == 0;
+    }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };
+
+namespace common_arg_utils {
+bool is_truthy(const std::string & value);
+bool is_falsey(const std::string & value);
+bool is_autoy(const std::string & value);
+}
 
 struct common_params_context {
     enum llama_example ex = LLAMA_EXAMPLE_COMMON;
     common_params & params;
@@ -76,7 +113,11 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
-// function to be used by test-arg-parser
+// parse input arguments from CLI into a map
+// TODO: support repeated args in the future
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+
+// initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 struct common_remote_params {
common/chat.cpp (132 changed lines)
@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"
@@ -150,6 +151,7 @@ struct templates_params {
     common_chat_tool_choice tool_choice;
     json json_schema;
     bool parallel_tool_calls;
+    common_reasoning_format reasoning_format;
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
@@ -589,6 +591,16 @@ common_chat_templates_ptr common_chat_templates_init(
             "{%- if false %}");
     }
 
+    // TODO @aldehir : this is a temporary fix, pending Minja changes
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+            // search for the error message and patch it
+            && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;
@@ -988,6 +1000,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
     return data;
 }
 
+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto role = msg.value("role", "");
+        if (role != "system" && role != "assistant") {
+            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+            adjusted_messages.push_back(msg);
+            continue;
+        }
+
+        auto content = json::array();
+
+        // If message contains `reasoning_content`, add it as a block of type `thinking`
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            content.push_back({
+                {"type", "thinking"},
+                {"thinking", msg.at("reasoning_content").get<std::string>()},
+            });
+        }
+
+        // If message contains `content`, add it as a block of type `text`
+        if (msg.contains("content")) {
+            if (msg.at("content").is_string()) {
+                content.push_back({
+                    {"type", "text"},
+                    {"text", msg.at("content").get<std::string>()},
+                });
+            } else if (msg.at("content").is_array()) {
+                auto blocks = msg.at("content");
+                content.insert(content.end(), blocks.begin(), blocks.end());
+            }
+        }
+
+        auto adjusted = msg;
+        adjusted["content"] = content;
+        adjusted.erase("reasoning_content");
+        adjusted_messages.push_back(adjusted);
+    }
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+        "[TOOL_CALLS]",
+        "[ARGS]",
+    };
+
+    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            // Ministral wants to emit json surrounded by code fences
+            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                const auto & schema = function.at("parameters");
+
+                tool_choice |= p.rule("tool-" + name,
+                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+                );
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2390,6 +2514,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
@@ -2568,6 +2693,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
     }
 
+    // Ministral/Mistral Large 3
+    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+            src.find("[TOOL_CALLS]") != std::string::npos &&
+            src.find("[ARGS]") != std::string::npos) {
+        return common_chat_params_init_ministral_3(tmpl, params);
+    }
+
     if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
         return common_chat_params_init_magistral(tmpl, params);
     }
@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
 // Model utils
 //
 
-static inline void common_init_sampler_from_model(
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
     const llama_model * model,
     common_params_sampling & sparams) {
 
     const uint64_t config = sparams.user_sampling_config;
 
     auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
+
         char buf[64] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };
 
     auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
+
         char buf[128] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
     get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
 }
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+                params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }
+
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        return iparams;
+        return;
     }
 
-    common_init_sampler_from_model(model, params.sampling);
+    pimpl->model.reset(model);
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    auto cparams = common_context_params_to_llama(params);
+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return;
     }
+
+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return pimpl->samplers[seq_id].get();
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     const auto cvec = common_control_vector_load(params.control_vectors);
     if (cvec.n_embd == -1) {
-        llama_free(lctx);
-        llama_model_free(model);
-
-        return iparams;
+        return res;
     }
 
     int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                 params.control_vector_layer_start,
                 params.control_vector_layer_end);
         if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }
@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }
@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
         }
 
         char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
         la.task_name = buf;
         llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
         la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-                params.sampling.logit_bias.end(),
-                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_set_warmup(lctx, false);
     }
 
-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }
 
+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
     std::string model_endpoint = "https://huggingface.co/";
     if (endpoint_env) {
         model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
     }
     return model_endpoint;
 }
@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -194,7 +196,6 @@ struct common_params_sampling {
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
-
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -302,8 +307,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
-    int32_t n_predict = -1;   // new tokens to predict
-    int32_t n_ctx     = 4096; // context size
+    int32_t n_predict = -1;   // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx     = 0;    // context size, 0 == context the model was trained with
     int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch  = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep    = 0;    // number of tokens to keep from initial prompt
@@ -327,6 +332,9 @@ struct common_params {
     int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool    fit_params         = true;               // whether to fit unset model/context parameters to free device memory
+    size_t  fit_params_target  = 1024 * 1024*1024;   // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;               // minimum context size to set when trying to reduce memory use
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -406,6 +414,7 @@ struct common_params {
     bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true;  // insert new sequences for decoding on-the-fly
     bool no_perf       = false; // disable performance metrics
+    bool show_timings  = true;  // show timing information on CLI
     bool ctx_shift     = false; // context shift on infinite text generation
     bool swa_full      = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified    = false; // enable unified KV cache
@@ -462,7 +471,7 @@ struct common_params {
     std::string public_path   = ""; // NOLINT
     std::string api_prefix    = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    bool use_jinja = false;         // NOLINT
+    bool use_jinja = true;          // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -483,6 +492,7 @@ struct common_params {
 
     // router server configs
     std::string models_dir = "";    // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
     int models_max = 4;             // maximum number of models to load simultaneously
     bool models_autoload = true;    // automatically load models when requested via the router server
@@ -666,15 +676,29 @@ bool tty_can_use_colors();
 // Model utils
 //
 
-// note: defines object's lifetime
-struct common_init_result {
-    llama_model_ptr   model;
-    llama_context_ptr context;
+struct common_sampler;
 
-    std::vector<llama_adapter_lora_ptr> lora;
+// note: defines the model, context, samplers, ets. lifetimes
+struct common_init_result {
+    common_init_result(common_params & params);
+    ~common_init_result();
+
+    llama_model *    model();
+    llama_context *  context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };
 
-struct common_init_result common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);
 
 struct llama_model_params   common_model_params_to_llama  (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
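Taken together, the hunks above replace the old value-style common_init_result (public llama_model_ptr/llama_context_ptr members) with a pimpl type that owns the model, context and per-sequence samplers, returned through a unique_ptr. A minimal caller-side sketch of the new API, assuming the declarations above plus common_params_parse from arg.h; the error handling is illustrative only:

    #include "arg.h"
    #include "common.h"

    int main(int argc, char ** argv) {
        common_params params;
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        // the loader now returns a unique_ptr owning model + context + samplers
        common_init_result_ptr init = common_init_from_params(params);
        if (init->model() == NULL || init->context() == NULL) {
            return 1; // failures were already logged by the loader
        }

        common_sampler * smpl = init->sampler(0); // per-sequence sampler
        (void) smpl;

        init->free_context(); // optionally drop the context, keep the model
        return 0;
    }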
@@ -1,6 +1,16 @@
 #include "console.h"
+#include "log.h"
 #include <vector>
 #include <iostream>
+#include <cassert>
+#include <cstddef>
+#include <cctype>
+#include <cwctype>
+#include <cstdint>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <stdarg.h>
 
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -30,18 +40,36 @@
 #define ANSI_COLOR_BLUE    "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_GRAY    "\x1b[90m"
 #define ANSI_COLOR_RESET   "\x1b[0m"
 #define ANSI_BOLD          "\x1b[1m"
 
 namespace console {
 
+#if defined (_WIN32)
+namespace {
+    // Use private-use unicode values to represent special keys that are not reported
+    // as characters (e.g. arrows on Windows). These values should never clash with
+    // real input and let the rest of the code handle navigation uniformly.
+    static constexpr char32_t KEY_ARROW_LEFT       = 0xE000;
+    static constexpr char32_t KEY_ARROW_RIGHT      = 0xE001;
+    static constexpr char32_t KEY_ARROW_UP         = 0xE002;
+    static constexpr char32_t KEY_ARROW_DOWN       = 0xE003;
+    static constexpr char32_t KEY_HOME             = 0xE004;
+    static constexpr char32_t KEY_END              = 0xE005;
+    static constexpr char32_t KEY_CTRL_ARROW_LEFT  = 0xE006;
+    static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
+    static constexpr char32_t KEY_DELETE           = 0xE008;
+}
+
     //
     // Console state
     //
 
+#endif
+
     static bool      advanced_display = false;
     static bool      simple_io        = true;
-    static display_t current_display  = reset;
+    static display_type current_display = DISPLAY_TYPE_RESET;
 
     static FILE* out = stdout;
@@ -120,7 +148,7 @@ namespace console {
 
     void cleanup() {
         // Reset console display
-        set_display(reset);
+        set_display(DISPLAY_TYPE_RESET);
 
 #if !defined(_WIN32)
         // Restore settings on POSIX systems
@@ -140,20 +168,26 @@ namespace console {
     //
 
     // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
+    void set_display(display_type display) {
         if (advanced_display && current_display != display) {
-            fflush(stdout);
+            common_log_flush(common_log_main());
             switch(display) {
-                case reset:
+                case DISPLAY_TYPE_RESET:
                     fprintf(out, ANSI_COLOR_RESET);
                     break;
-                case prompt:
+                case DISPLAY_TYPE_INFO:
+                    fprintf(out, ANSI_COLOR_MAGENTA);
+                    break;
+                case DISPLAY_TYPE_PROMPT:
                     fprintf(out, ANSI_COLOR_YELLOW);
                     break;
-                case user_input:
+                case DISPLAY_TYPE_REASONING:
+                    fprintf(out, ANSI_COLOR_GRAY);
+                    break;
+                case DISPLAY_TYPE_USER_INPUT:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                     break;
-                case error:
+                case DISPLAY_TYPE_ERROR:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
             }
             current_display = display;
@@ -176,7 +210,18 @@ namespace console {
             if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
                 wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
                 if (wc == 0) {
-                    continue;
+                    const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
+                    const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
+                    switch (record.Event.KeyEvent.wVirtualKeyCode) {
+                        case VK_LEFT:   return ctrl_pressed ? KEY_CTRL_ARROW_LEFT  : KEY_ARROW_LEFT;
+                        case VK_RIGHT:  return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
+                        case VK_UP:     return KEY_ARROW_UP;
+                        case VK_DOWN:   return KEY_ARROW_DOWN;
+                        case VK_HOME:   return KEY_HOME;
+                        case VK_END:    return KEY_END;
+                        case VK_DELETE: return KEY_DELETE;
+                        default:        continue;
+                    }
                 }
 
                 if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
@@ -315,6 +360,52 @@ namespace console {
 #endif
     }
 
+    static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
+        unsigned char c = static_cast<unsigned char>(input[pos]);
+        if ((c & 0x80u) == 0u) {
+            advance = 1;
+            return c;
+        }
+        if ((c & 0xE0u) == 0xC0u && pos + 1 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            if ((c1 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 2;
+            return ((c & 0x1Fu) << 6) | (static_cast<unsigned char>(input[pos + 1]) & 0x3Fu);
+        }
+        if ((c & 0xF0u) == 0xE0u && pos + 2 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
+            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 3;
+            return ((c & 0x0Fu) << 12) |
+                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 6) |
+                   (static_cast<unsigned char>(input[pos + 2]) & 0x3Fu);
+        }
+        if ((c & 0xF8u) == 0xF0u && pos + 3 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
+            unsigned char c3 = static_cast<unsigned char>(input[pos + 3]);
+            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u || (c3 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 4;
+            return ((c & 0x07u) << 18) |
+                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 12) |
+                   ((static_cast<unsigned char>(input[pos + 2]) & 0x3Fu) << 6) |
+                   (static_cast<unsigned char>(input[pos + 3]) & 0x3Fu);
+        }
+
+        advance = 1;
+        return 0xFFFD; // replacement character for invalid input
+    }
+
     static void append_utf8(char32_t ch, std::string & out) {
         if (ch <= 0x7F) {
             out.push_back(static_cast<unsigned char>(ch));
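The new decode_utf8 is the read-side counterpart of the existing append_utf8: it decodes one code point at pos, reports the number of bytes consumed through advance, and substitutes U+FFFD for malformed sequences while still advancing, so a scan can never get stuck. An illustrative walk over a mixed-width string (the helper is file-local in the diff, so this is a sketch rather than a self-contained translation unit):

    std::string s = "a\xC3\xA9z"; // "aéz": 1-byte, 2-byte, 1-byte sequences
    size_t pos = 0;
    while (pos < s.size()) {
        size_t advance = 0;
        char32_t cp = decode_utf8(s, pos, advance); // advance == 1 or 2 here
        std::printf("U+%04X (%zu byte(s))\n", (unsigned) cp, advance);
        pos += advance; // always >= 1, so the loop terminates on any input
    }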
@@ -336,21 +427,318 @@ namespace console {
     }
 
     // Helper function to remove the last UTF-8 character from a string
-    static void pop_back_utf8_char(std::string & line) {
-        if (line.empty()) {
+    static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
+        if (pos == 0) return 0;
+        pos--;
+        while (pos > 0 && (line[pos] & 0xC0) == 0x80) {
+            pos--;
+        }
+        return pos;
+    }
+
+    static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
+        if (pos >= line.length()) return line.length();
+        pos++;
+        while (pos < line.length() && (line[pos] & 0xC0) == 0x80) {
+            pos++;
+        }
+        return pos;
+    }
+
+    static void move_cursor(int delta);
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+
+    static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
+        if (char_pos >= widths.size()) {
             return;
         }
 
-        size_t pos = line.length() - 1;
+        size_t next_pos = next_utf8_char_pos(line, byte_pos);
+        int w = widths[char_pos];
+        size_t char_len = next_pos - byte_pos;
 
-        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-            if ((line[pos] & 0xC0) != 0x80) {
-                break; // Found the start of the character
+        line.erase(byte_pos, char_len);
+        widths.erase(widths.begin() + char_pos);
+
+        size_t p = byte_pos;
+        int tail_width = 0;
+        for (size_t i = char_pos; i < widths.size(); ++i) {
+            size_t following = next_utf8_char_pos(line, p);
+            put_codepoint(line.c_str() + p, following - p, widths[i]);
+            tail_width += widths[i];
+            p = following;
+        }
+
+        for (int i = 0; i < w; ++i) {
+            fputc(' ', out);
+        }
+
+        move_cursor(-(tail_width + w));
+    }
+
+    static void clear_current_line(const std::vector<int> & widths) {
+        int total_width = 0;
+        for (int w : widths) {
+            total_width += (w > 0 ? w : 1);
+        }
+
+        if (total_width > 0) {
+            std::string spaces(total_width, ' ');
+            fwrite(spaces.c_str(), 1, total_width, out);
+            move_cursor(-total_width);
             }
         }
-        line.erase(pos);
+
+    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
+                                  size_t & byte_pos) {
+        move_to_line_start(char_pos, byte_pos, widths);
+        clear_current_line(widths);
+
+        line = std::move(new_line);
+        widths.clear();
+        byte_pos = 0;
+        char_pos = 0;
+
+        size_t idx = 0;
+        while (idx < line.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, idx, advance);
+            int expected_width = estimateWidth(cp);
+            int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
+            if (real_width < 0) real_width = 0;
+            widths.push_back(real_width);
+            idx += advance;
+            ++char_pos;
+            byte_pos = idx;
         }
+    }
+
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
+        int back_width = 0;
+        for (size_t i = 0; i < char_pos; ++i) {
+            back_width += widths[i];
+        }
+        move_cursor(-back_width);
+        char_pos = 0;
+        byte_pos = 0;
+    }
+
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        int forward_width = 0;
+        for (size_t i = char_pos; i < widths.size(); ++i) {
+            forward_width += widths[i];
+        }
+        move_cursor(forward_width);
+        char_pos = widths.size();
+        byte_pos = line.length();
+    }
+
+    static bool has_ctrl_modifier(const std::string & params) {
+        size_t start = 0;
+        while (start < params.size()) {
+            size_t end = params.find(';', start);
+            size_t len = (end == std::string::npos) ? params.size() - start : end - start;
+            if (len > 0) {
+                int value = 0;
+                for (size_t i = 0; i < len; ++i) {
+                    char ch = params[start + i];
+                    if (!std::isdigit(static_cast<unsigned char>(ch))) {
+                        value = -1;
+                        break;
+                    }
+                    value = value * 10 + (ch - '0');
+                }
+                if (value == 5) {
+                    return true;
+                }
+            }
+
+            if (end == std::string::npos) {
+                break;
+            }
+            start = end + 1;
+        }
+        return false;
+    }
+
+    static bool is_space_codepoint(char32_t cp) {
+        return std::iswspace(static_cast<wint_t>(cp)) != 0;
+    }
+
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos == 0) {
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0;
+
+        while (new_char_pos > 0) {
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        while (new_char_pos > 0) {
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        move_cursor(-move_width);
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos >= widths.size()) {
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0;
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        move_cursor(move_width);
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_cursor(int delta) {
+        if (delta == 0) return;
+#if defined(_WIN32)
+        if (hConsole != NULL) {
+            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
+            COORD newCursorPosition = bufferInfo.dwCursorPosition;
+            int width = bufferInfo.dwSize.X;
+            int newX = newCursorPosition.X + delta;
+            int newY = newCursorPosition.Y;
+
+            while (newX >= width) {
+                newX -= width;
+                newY++;
+            }
+            while (newX < 0) {
+                newX += width;
+                newY--;
+            }
+
+            newCursorPosition.X = newX;
+            newCursorPosition.Y = newY;
+            SetConsoleCursorPosition(hConsole, newCursorPosition);
+        }
+#else
+        if (delta < 0) {
+            for (int i = 0; i < -delta; i++) fprintf(out, "\b");
+        } else {
+            for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
+        }
+#endif
+    }
+
+    struct history_t {
+        std::vector<std::string> entries;
+        size_t viewing_idx = SIZE_MAX;
+        std::string backup_line; // current line before viewing history
+        void add(const std::string & line) {
+            if (line.empty()) {
+                return;
+            }
+            // avoid duplicates with the last entry
+            if (entries.empty() || entries.back() != line) {
+                entries.push_back(line);
+            }
+            // also clear viewing state
+            end_viewing();
+        }
+        bool prev(std::string & cur_line) {
+            if (entries.empty()) {
+                return false;
+            }
+            if (viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            if (viewing_idx > 0) {
+                viewing_idx--;
+            }
+            cur_line = entries[viewing_idx];
+            return true;
+        }
+        bool next(std::string & cur_line) {
+            if (entries.empty() || viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            viewing_idx++;
+            if (viewing_idx >= entries.size()) {
+                cur_line = backup_line;
+                end_viewing();
+            } else {
+                cur_line = entries[viewing_idx];
+            }
+            return true;
+        }
+        void begin_viewing(const std::string & line) {
+            backup_line = line;
+            viewing_idx = entries.size();
+        }
+        void end_viewing() {
+            viewing_idx = SIZE_MAX;
+            backup_line.clear();
+        }
+        bool is_viewing() const {
+            return viewing_idx != SIZE_MAX;
+        }
+    } history;
+
     static bool readline_advanced(std::string & line, bool multiline_input) {
         if (out != stdout) {
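history_t is a small state machine: begin_viewing snapshots the line being typed, prev/next step through stored entries, and stepping past the newest entry restores the snapshot and leaves viewing mode. A condensed trace of the expected behavior, derived from the code above:

    history_t h;
    h.add("first");
    h.add("second");

    std::string line = "draft";  // what the user is currently typing
    h.begin_viewing(line);       // viewing_idx = entries.size() == 2

    std::string out;
    h.prev(out);  // out == "second" (viewing_idx -> 1)
    h.prev(out);  // out == "first"  (viewing_idx -> 0)
    h.next(out);  // out == "second" (viewing_idx -> 1)
    h.next(out);  // out == "draft": stepping past the end restores the
                  // backup line and calls end_viewing()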
@@ -362,8 +750,33 @@ namespace console {
         bool is_special_char = false;
         bool end_of_stream   = false;
 
+        size_t byte_pos = 0; // current byte index
+        size_t char_pos = 0; // current character index (one char can be multiple bytes)
+
         char32_t input_char;
         while (true) {
+            assert(char_pos <= byte_pos);
+            assert(char_pos <= widths.size());
+            auto history_prev = [&]() {
+                if (!history.is_viewing()) {
+                    history.begin_viewing(line);
+                }
+                std::string new_line;
+                if (!history.prev(new_line)) {
+                    return;
+                }
+                set_line_contents(new_line, line, widths, char_pos, byte_pos);
+            };
+            auto history_next = [&]() {
+                if (history.is_viewing()) {
+                    std::string new_line;
+                    if (!history.next(new_line)) {
+                        return;
+                    }
+                    set_line_contents(new_line, line, widths, char_pos, byte_pos);
+                }
+            };
+
             fflush(out); // Ensure all output is displayed before waiting for input
             input_char = getchar32();
@@ -371,20 +784,83 @@ namespace console {
                 break;
             }
 
-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
                 end_of_stream = true;
                 break;
             }
 
             if (is_special_char) {
-                set_display(user_input);
                 replace_last(line.back());
                 is_special_char = false;
             }
 
             if (input_char == '\033') { // Escape sequence
                 char32_t code = getchar32();
-                if (code == '[' || code == 0x1B) {
+                if (code == '[') {
+                    std::string params;
+                    while (true) {
+                        code = getchar32();
+                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
+                            break;
+                        }
+                        params.push_back(static_cast<char>(code));
+                    }
+
+                    const bool ctrl_modifier = has_ctrl_modifier(params);
+
+                    if (code == 'D') { // left
+                        if (ctrl_modifier) {
+                            move_word_left(char_pos, byte_pos, widths, line);
+                        } else if (char_pos > 0) {
+                            int w = widths[char_pos - 1];
+                            move_cursor(-w);
+                            char_pos--;
+                            byte_pos = prev_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'C') { // right
+                        if (ctrl_modifier) {
+                            move_word_right(char_pos, byte_pos, widths, line);
+                        } else if (char_pos < widths.size()) {
+                            int w = widths[char_pos];
+                            move_cursor(w);
+                            char_pos++;
+                            byte_pos = next_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'H') { // home
+                        move_to_line_start(char_pos, byte_pos, widths);
+                    } else if (code == 'F') { // end
+                        move_to_line_end(char_pos, byte_pos, widths, line);
+                    } else if (code == 'A' || code == 'B') {
+                        // up/down
+                        if (code == 'A') {
+                            history_prev();
+                            is_special_char = false;
+                        } else if (code == 'B') {
+                            history_next();
+                            is_special_char = false;
+                        }
+                    } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
+                        std::string digits;
+                        for (char ch : params) {
+                            if (ch == ';') {
+                                break;
+                            }
+                            if (std::isdigit(static_cast<unsigned char>(ch))) {
+                                digits.push_back(ch);
+                            }
+                        }
+
+                        if (code == '~') {
+                            if (digits == "1" || digits == "7") { // home
+                                move_to_line_start(char_pos, byte_pos, widths);
+                            } else if (digits == "4" || digits == "8") { // end
+                                move_to_line_end(char_pos, byte_pos, widths, line);
+                            } else if (digits == "3") { // delete
+                                delete_at_cursor(line, widths, char_pos, byte_pos);
+                            }
+                        }
+                    }
+                } else if (code == 0x1B) {
                     // Discard the rest of the escape sequence
                     while ((code = getchar32()) != (char32_t) WEOF) {
                         if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
@@ -392,32 +868,110 @@ namespace console {
                         }
                     }
                 }
-            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (!widths.empty()) {
-                    int count;
-                    do {
-                        count = widths.back();
-                        widths.pop_back();
-                        // Move cursor back, print space, and move cursor back again
-                        for (int i = 0; i < count; i++) {
-                            replace_last(' ');
-                            pop_cursor();
+#if defined(_WIN32)
+            } else if (input_char == KEY_ARROW_LEFT) {
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    byte_pos = prev_utf8_char_pos(line, byte_pos);
                 }
-                        pop_back_utf8_char(line);
-                    } while (count == 0 && !widths.empty());
+            } else if (input_char == KEY_ARROW_RIGHT) {
+                if (char_pos < widths.size()) {
+                    int w = widths[char_pos];
+                    move_cursor(w);
+                    char_pos++;
+                    byte_pos = next_utf8_char_pos(line, byte_pos);
+                }
+            } else if (input_char == KEY_CTRL_ARROW_LEFT) {
+                move_word_left(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
+                move_word_right(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_HOME) {
+                move_to_line_start(char_pos, byte_pos, widths);
+            } else if (input_char == KEY_END) {
+                move_to_line_end(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_DELETE) {
+                delete_at_cursor(line, widths, char_pos, byte_pos);
+            } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
+                if (input_char == KEY_ARROW_UP) {
+                    history_prev();
+                    is_special_char = false;
+                } else if (input_char == KEY_ARROW_DOWN) {
+                    history_next();
+                    is_special_char = false;
+                }
+#endif
+            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
+                    size_t char_len = byte_pos - prev_pos;
+                    byte_pos = prev_pos;
+
+                    // remove the character
+                    line.erase(byte_pos, char_len);
+                    widths.erase(widths.begin() + char_pos);
+
+                    // redraw tail
+                    size_t p = byte_pos;
+                    int tail_width = 0;
+                    for (size_t i = char_pos; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    // clear display
+                    for (int i = 0; i < w; ++i) {
+                        fputc(' ', out);
+                    }
+                    move_cursor(-(tail_width + w));
                 }
             } else {
-                int offset = line.length();
-                append_utf8(input_char, line);
-                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-                if (width < 0) {
-                    width = 0;
+                // insert character
+                std::string new_char_str;
+                append_utf8(input_char, new_char_str);
+                int w = estimateWidth(input_char);
+
+                if (char_pos == widths.size()) {
+                    // insert at the end
+                    line += new_char_str;
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+                    widths.push_back(real_w);
+                    byte_pos += new_char_str.length();
+                    char_pos++;
+                } else {
+                    // insert in middle
+                    line.insert(byte_pos, new_char_str);
+
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+
+                    widths.insert(widths.begin() + char_pos, real_w);
+
+                    // print the tail
+                    size_t p = byte_pos + new_char_str.length();
+                    int tail_width = 0;
+                    for (size_t i = char_pos + 1; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    move_cursor(-tail_width);
+
+                    byte_pos += new_char_str.length();
+                    char_pos++;
                 }
-                widths.push_back(width);
             }
 
             if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-                set_display(prompt);
                 replace_last(line.back());
                 is_special_char = true;
             }
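Throughout this editing code three quantities describe the same cursor: byte_pos (offset into the UTF-8 line), char_pos (index into widths), and the sum widths[0..char_pos) (the on-screen column). The asserts at the top of the read loop check only the cheap half of that invariant (char_pos <= byte_pos and char_pos <= widths.size()). With illustrative values, assuming a double-width CJK character:

    // line   = "a\xE7\x95\x8Cb"   ("a界b": 1 + 3 + 1 bytes)
    // widths = {1, 2, 1}          (terminal cells per character)
    //
    // cursor placed after "界":
    //   char_pos = 2                       // characters to the left
    //   byte_pos = 4                       // bytes to the left (1 + 3)
    //   column   = widths[0] + widths[1]   // 3 cells from line start
    //
    // which is why moving left is always the same three updates:
    //   move_cursor(-widths[char_pos - 1]);
    //   char_pos--;
    //   byte_pos = prev_utf8_char_pos(line, byte_pos);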
@@ -451,6 +1005,15 @@ namespace console {
             }
         }
 
+        if (!end_of_stream && !line.empty()) {
+            // remove the trailing newline for history storage
+            if (!line.empty() && line.back() == '\n') {
+                line.pop_back();
+            }
+            // TODO: maybe support multiline history entries?
+            history.add(line);
+        }
+
         fflush(out);
         return has_more;
     }
@@ -493,12 +1056,82 @@ namespace console {
     }
 
     bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
         if (simple_io) {
             return readline_simple(line, multiline_input);
         }
         return readline_advanced(line, multiline_input);
     }
+
+    namespace spinner {
+        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
+        static std::condition_variable cv_stop;
+        static std::thread th;
+        static size_t frame = 0; // only modified by one thread
+        static bool running = false;
+        static std::mutex mtx;
+        static auto wait_time = std::chrono::milliseconds(100);
+        static void draw_next_frame() {
+            // don't need lock because only one thread modifies running
+            frame = (frame + 1) % sizeof(LOADING_CHARS);
+            replace_last(LOADING_CHARS[frame]);
+            fflush(out);
+        }
+        void start() {
+            std::unique_lock<std::mutex> lock(mtx);
+            if (simple_io || running) {
+                return;
+            }
+            common_log_flush(common_log_main());
+            fprintf(out, "%c", LOADING_CHARS[0]);
+            fflush(out);
+            frame = 1;
+            running = true;
+            th = std::thread([]() {
+                std::unique_lock<std::mutex> lock(mtx);
+                while (true) {
+                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
+                        break;
+                    }
+                    draw_next_frame();
+                }
+            });
+        }
+        void stop() {
+            {
+                std::unique_lock<std::mutex> lock(mtx);
+                if (simple_io || !running) {
+                    return;
+                }
+                running = false;
+                cv_stop.notify_all();
+            }
+            if (th.joinable()) {
+                th.join();
+            }
+            replace_last(' ');
+            pop_cursor();
+            fflush(out);
+        }
+    }
+
+    void log(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(out, fmt, args);
+        va_end(args);
+    }
+
+    void error(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        display_type cur = current_display;
+        set_display(DISPLAY_TYPE_ERROR);
+        vfprintf(out, fmt, args);
+        set_display(cur); // restore previous color
+        va_end(args);
+    }
+
+    void flush() {
+        fflush(out);
+    }
 }
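The spinner redraws with replace_last on a dedicated thread, and stop() both joins the thread and erases the glyph, so it composes cleanly with the rest of the console state. A typical call pattern (sketch; do_slow_work is a hypothetical stand-in for model loading or prompt processing):

    #include "console.h"

    void do_slow_work(); // hypothetical long-running call

    void load_with_feedback() {
        console::log("loading model ");
        console::spinner::start(); // no-op when simple_io is set

        do_slow_work();

        console::spinner::stop();  // joins the worker and erases the glyph
        console::log("done\n");
    }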
@@ -2,18 +2,40 @@
 
 #pragma once
 
+#include "common.h"
+
 #include <string>
 
-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
+enum display_type {
+    DISPLAY_TYPE_RESET = 0,
+    DISPLAY_TYPE_INFO,
+    DISPLAY_TYPE_PROMPT,
+    DISPLAY_TYPE_REASONING,
+    DISPLAY_TYPE_USER_INPUT,
+    DISPLAY_TYPE_ERROR
+};
+
+namespace console {
     void init(bool use_simple_io, bool use_advanced_display);
     void cleanup();
-    void set_display(display_t display);
+    void set_display(display_type display);
     bool readline(std::string & line, bool multiline_input);
+
+    namespace spinner {
+        void start();
+        void stop();
+    }
+
+    // note: the logging API below output directly to stdout
+    // it can negatively impact performance if used on inference thread
+    // only use in in a dedicated CLI thread
+    // for logging in inference thread, use log.h instead
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void log(const char * fmt, ...);
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void error(const char * fmt, ...);
+
+    void flush();
 }
@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,15 +474,35 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #elif defined(LLAMA_USE_HTTPLIB)
 
-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
     return _isatty(_fileno(stdout));
 #else
     return isatty(1);
 #endif
 }
 
-static void print_progress(size_t current, size_t total) {
+public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
+    }
+
+    void update(size_t current, size_t total) {
     if (!is_output_a_tty()) {
         return;
     }
@@ -489,19 +511,42 @@
         return;
     }
 
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (lines.find(this) == lines.end()) {
+        lines[this] = max_line++;
+        std::cout << "\n";
+    }
+    int lines_up = max_line - lines[this];
+
     size_t width = 50;
     size_t pct = (100 * current) / total;
     size_t pos = (width * current) / total;
 
-    std::cout << "["
+    std::cout << "\033[s";
+
+    if (lines_up > 0) {
+        std::cout << "\033[" << lines_up << "A";
+    }
+    std::cout << "\033[2K\r["
               << std::string(pos, '=')
               << (pos < width ? ">" : "")
               << std::string(width - pos, ' ')
              << "] " << std::setw(3) << pct << "% ("
              << current / (1024 * 1024) << " MB / "
-             << total / (1024 * 1024) << " MB)\r";
+             << total / (1024 * 1024) << " MB) "
+             << "\033[u";
+
     std::cout.flush();
-}
+
+        if (current == total) {
+            cleanup(this);
+        }
+    }
+
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};
 
 static bool common_pull_file(httplib::Client & cli,
                              const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
     const char * func = __func__; // avoid __func__ inside a lambda
     size_t downloaded = existing_size;
     size_t progress_step = 0;
+    ProgressBar bar;
 
     auto res = cli.Get(resolve_path, headers,
         [&](const httplib::Response &response) {
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
             progress_step += len;
 
             if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                 progress_step = 0;
             }
             return true;
@@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
         nullptr
     );
 
-    std::cout << "\n";
-
     if (!res) {
         LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
         return false;
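The rewritten rendering is what makes several concurrent download bars possible: each ProgressBar claims a line on first update, and every redraw saves the cursor (ESC[s), jumps up lines_up rows, clears that row (ESC[2K), redraws, and restores the cursor (ESC[u), so bars never interleave with each other or with ordinary output. That is also why the unconditional std::cout << "\n" after the request was dropped. The cursor dance in isolation (standard ANSI sequences, not code taken from the diff):

    #include <iostream>
    #include <string>

    void redraw_line_above(int lines_up, const std::string & text) {
        std::cout << "\033[s";                       // save cursor position
        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A"; // move up N rows
        }
        std::cout << "\033[2K\r" << text;            // clear the row, redraw it
        std::cout << "\033[u";                       // restore cursor position
        std::cout.flush();
    }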
@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }
 
+void common_log_flush(struct common_log * log) {
+    log->pause();
+    log->resume();
+}
+
 static int common_get_verbosity(enum ggml_log_level level) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
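Implementing common_log_flush as a pause()/resume() pair looks odd but makes sense given that the logger writes from a worker thread: pausing presumably waits for the worker to finish its queued entries, so the pair acts as a drain barrier. That is what the console hunks above rely on before emitting raw ANSI codes:

    // sketch: drain pending log output before touching the terminal directly
    common_log_flush(common_log_main());
    fprintf(stdout, "\x1b[33m"); // queued log lines can no longer interleave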
@@ -84,6 +84,7 @@ void common_log_set_file      (struct common_log * log, const char * file); // n
 void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_flush         (struct common_log * log);                    // flush all pending log messages
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -0,0 +1,206 @@
+#include "arg.h"
+#include "preset.h"
+#include "peg-parser.h"
+#include "log.h"
+
+#include <fstream>
+#include <sstream>
+#include <filesystem>
+
+static std::string rm_leading_dashes(const std::string & str) {
+    size_t pos = 0;
+    while (pos < str.size() && str[pos] == '-') {
+        ++pos;
+    }
+    return str.substr(pos);
+}
+
+std::vector<std::string> common_preset::to_args() const {
+    std::vector<std::string> args;
+
+    for (const auto & [opt, value] : options) {
+        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // flag option, no value
+            if (common_arg_utils::is_falsey(value)) {
+                // use negative arg if available
+                if (!opt.args_neg.empty()) {
+                    args.back() = opt.args_neg.back();
+                } else {
+                    // otherwise, skip the flag
+                    // TODO: maybe throw an error instead?
+                    args.pop_back();
+                }
+            }
+        }
+        if (opt.value_hint != nullptr) {
+            // single value
+            args.push_back(value);
+        }
+        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
+            throw std::runtime_error(string_format(
+                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
+                opt.args.back()
+            ));
+        }
+    }
+
+    return args;
+}
+
+std::string common_preset::to_ini() const {
+    std::ostringstream ss;
+
+    ss << "[" << name << "]\n";
+    for (const auto & [opt, value] : options) {
+        auto escaped_value = value;
+        string_replace_all(escaped_value, "\n", "\\\n");
+        ss << rm_leading_dashes(opt.args.back()) << " = ";
+        ss << escaped_value << "\n";
+    }
+    ss << "\n";
+
+    return ss.str();
+}
+
+static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
+    std::map<std::string, std::map<std::string, std::string>> parsed;
+
+    if (!std::filesystem::exists(path)) {
+        throw std::runtime_error("preset file does not exist: " + path);
+    }
+
+    std::ifstream file(path);
+    if (!file.good()) {
+        throw std::runtime_error("failed to open server preset file: " + path);
+    }
+
+    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+    static const auto parser = build_peg_parser([](auto & p) {
+        // newline ::= "\r\n" / "\n" / "\r"
+        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
+
+        // ws ::= [ \t]*
+        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
+
+        // comment ::= [;#] (!newline .)*
+        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
+
+        // eol ::= ws comment? (newline / EOF)
+        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
+
+        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
+        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
+
+        // value ::= (!eol-start .)*
+        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
+        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
+
+        // header-line ::= "[" ws ident ws "]" eol
+        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
+
+        // kv-line ::= ident ws "=" ws value eol
+        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
+
+        // comment-line ::= ws comment (newline / EOF)
+        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
+
+        // blank-line ::= ws (newline / EOF)
+        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
+
+        // line ::= header-line / kv-line / comment-line / blank-line
+        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
+
+        // ini ::= line* EOF
+        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
+
+        return ini;
+    });
+
+    common_peg_parse_context ctx(contents);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        throw std::runtime_error("failed to parse server config file: " + path);
+    }
+
+    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
+    std::string current_key;
+
+    ctx.ast.visit(result, [&](const auto & node) {
+        if (node.tag == "section-name") {
+            const std::string section = std::string(node.text);
+            current_section = section;
+            parsed[current_section] = {};
+        } else if (node.tag == "key") {
+            const std::string key = std::string(node.text);
+            current_key = key;
+        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
+            parsed[current_section][current_key] = std::string(node.text);
+            current_key.clear();
+        }
+    });
+
+    return parsed;
+}
+
+static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
+    std::map<std::string, common_arg> mapping;
+    for (const auto & opt : ctx_params.options) {
+        for (const auto & env : opt.get_env()) {
+            mapping[env] = opt;
+        }
+        for (const auto & arg : opt.get_args()) {
+            mapping[rm_leading_dashes(arg)] = opt;
+        }
+    }
+    return mapping;
+}
+
+static bool is_bool_arg(const common_arg & arg) {
+    return !arg.args_neg.empty();
+}
+
+static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
+    // if this is a negated arg, we need to reverse the value
+    for (const auto & neg_arg : arg.args_neg) {
+        if (rm_leading_dashes(neg_arg) == key) {
+            return common_arg_utils::is_truthy(value) ? "false" : "true";
+        }
+    }
+    // otherwise, not negated
+    return value;
+}
+
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+    common_presets out;
+    auto key_to_opt = get_map_key_opt(ctx_params);
+    auto ini_data = parse_ini_from_file(path);
+
+    for (auto section : ini_data) {
+        common_preset preset;
+        if (section.first.empty()) {
+            preset.name = COMMON_PRESET_DEFAULT_NAME;
+        } else {
+            preset.name = section.first;
+        }
+        LOG_DBG("loading preset: %s\n", preset.name.c_str());
+        for (const auto & [key, value] : section.second) {
+            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (key_to_opt.find(key) != key_to_opt.end()) {
+                auto & opt = key_to_opt[key];
+                if (is_bool_arg(opt)) {
+                    preset.options[opt] = parse_bool_arg(opt, key, value);
+                } else {
+                    preset.options[opt] = value;
+                }
+                LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
+            } else {
+                // TODO: maybe warn about unknown key?
+            }
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
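For reference, a preset file accepted by this grammar looks like the following (option names here are illustrative; each [section] becomes one common_preset, keys are CLI argument names without the leading dashes, ';' or '#' starts a comment, and keys before any header land in the implicit "default" section):

    ; global options go into the implicit "default" section
    ctx-size = 4096

    [my-model]              # the section name becomes the preset name
    model = /models/foo.gguf
    temp = 0.8
    no-mmap = true          ; negated boolean args are flipped on load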
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "common.h"
+#include "arg.h"
+
+#include <string>
+#include <vector>
+#include <map>
+
+//
+// INI preset parser and writer
+//
+
+constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
+
+struct common_preset {
+    std::string name;
+    // TODO: support repeated args in the future
+    std::map<common_arg, std::string> options;
+
+    // convert preset to CLI argument list
+    std::vector<std::string> to_args() const;
+
+    // convert preset to INI format string
+    std::string to_ini() const;
+
+    // TODO: maybe implement to_env() if needed
+};
+
+// interface for multiple presets in one file
+using common_presets = std::map<std::string, common_preset>;
+
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
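A sketch of how a caller might consume these presets, assuming a common_params_context obtained from the usual argument-parser setup (error handling elided):

    common_presets presets = common_presets_load("presets.ini", ctx_params);

    for (const auto & [name, preset] : presets) {
        printf("%s", preset.to_ini().c_str());           // round-trip back to INI
        std::vector<std::string> args = preset.to_args(); // expand to argv-style flags
    }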
@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
    common_params_sampling params;

-    struct llama_sampler * grmr;
    struct llama_sampler * chain;

+    bool grammar;
+
    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
    void reset() {
        prev.clear();

-        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
    }

@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
#else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            trigger_patterns_c.push_back(regex.c_str());
        }

-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                   trigger_patterns_c.data(), trigger_patterns_c.size(),
-                   trigger_tokens.data(), trigger_tokens.size())
-             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }
+
+            grammar = true;
        }
    }

-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                        c_breakers.push_back(str.c_str());
                    }

-                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                }
                break;
            case COMMON_SAMPLER_TYPE_TOP_K:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                samplers.push_back(llama_sampler_init_top_k (params.top_k));
                break;
            case COMMON_SAMPLER_TYPE_TOP_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                break;
            case COMMON_SAMPLER_TYPE_MIN_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_XTC:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                break;
            case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                break;
            case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                samplers.push_back(llama_sampler_init_infill (vocab));
                break;
            case COMMON_SAMPLER_TYPE_PENALTIES:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                break;
            default:
                GGML_ASSERT(false && "unknown sampler type");
        }
    }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
    return result;
}

void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
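The rebuilt chain keeps a fixed order: the grammar sampler (when present) is pushed first and the selection sampler (dist or one of the mirostat variants) last, which is what the accept logic below relies on. One way to inspect it, assuming an initialized gsmpl:

    // common_sampler_print walks the chain in order, so the grammar
    // sampler (if any) is listed first and dist/mirostat last
    LOG_INF("chain: %s\n", common_sampler_print(gsmpl).c_str());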
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
-
-    llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
+
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }

    gsmpl->prev.push_back(token);
}
@@ -330,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
}

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
    const auto tm = gsmpl->tm();

-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;

-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
+    return id;
}

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
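With the grammar_first flag gone, a plain generation loop over this API reduces to sample-then-accept. A sketch (assumes an initialized gsmpl, ctx, and vocab, with batch decoding handled elsewhere; llama_vocab_is_eog is the stock end-of-generation check):

    while (true) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, -1);
        common_sampler_accept(gsmpl, id, /* accept_grammar = */ true);
        if (llama_vocab_is_eog(vocab, id)) {
            break;
        }
        // ... decode `id` and continue
    }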
@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
}

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
// extended sampling implementation:
//
// - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

// generalized version of common_sampler_sample
//
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

// assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

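For the batched variants above, the draft-verification pattern is unchanged apart from the dropped flag. A sketch, assuming `draft` holds tokens proposed by a draft model:

    // one target sample per draft position plus one final position;
    // the result stops at the first token that diverges from the draft,
    // so ids.size() - 1 is the number of accepted draft tokens
    const std::vector<llama_token> ids = common_sampler_sample_and_accept_n(gsmpl, ctx, draft);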
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
    const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
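The new deleter enables scope-based cleanup instead of a manual common_sampler_free call, e.g. (a sketch; model and params as passed to common_sampler_init):

    common_sampler_ptr gsmpl(common_sampler_init(model, params));
    if (!gsmpl) {
        // initialization failed
    }
    // use gsmpl.get() ...; common_sampler_free runs automatically on scope exit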
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

-        common_sampler_sample(smpl, ctx_dft, 0, true);
+        common_sampler_sample(smpl, ctx_dft, 0);

        const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -383,6 +383,17 @@ class ModelBase:
                    s = self.model_tensors[name]
                    self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                    tensors_to_remove.append(name)
+                if name.endswith(".activation_scale"):  # unused
+                    tensors_to_remove.append(name)
+                # mistral format
+                if name.endswith(".qscale_weight"):
+                    weight_name = name.removesuffix("qscale_weight") + "weight"
+                    w = self.model_tensors[weight_name]
+                    s = self.model_tensors[name]
+                    self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+                    tensors_to_remove.append(name)
+                if name.endswith(".qscale_act"):
+                    tensors_to_remove.append(name)
            elif quant_method == "gptq":
                for name in self.model_tensors.keys():
                    if name.endswith(".qweight"):
@@ -694,6 +705,9 @@ class ModelBase:
        if "llm_config" in config:
            # rename for InternVL
            config["text_config"] = config["llm_config"]
+        if "lm_config" in config:
+            # rename for GlmASR
+            config["text_config"] = config["lm_config"]
        if "thinker_config" in config:
            # rename for Qwen2.5-Omni
            config["text_config"] = config["thinker_config"]["text_config"]
@@ -743,6 +757,15 @@ class TextModel(ModelBase):
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

+        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
+
+        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
+        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
+            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+                self.rope_parameters["rope_theta"] = rope_theta
+            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
+                self.rope_parameters["rope_type"] = rope_type
+
    @classmethod
    def __init_subclass__(cls):
        # can't use an abstract property, because overriding it without type errors
@@ -784,7 +807,7 @@ class TextModel(ModelBase):
    def set_gguf_parameters(self):
        self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")
@@ -804,7 +827,42 @@ class TextModel(ModelBase):
            self.gguf_writer.add_head_count_kv(n_head_kv)
            logger.info(f"gguf: key-value head count = {n_head_kv}")

-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
+        if (rope_type := rope_params.get("rope_type")) is not None:
+            rope_factor = rope_params.get("factor")
+            rope_gguf_type = gguf.RopeScalingType.NONE
+            if rope_type == "linear" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.LINEAR
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+            elif rope_type == "yarn" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.YARN
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+                if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
+                if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
+                if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
+                if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
+                # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
+            elif rope_type == "su" or rope_type == "longrope":
+                rope_gguf_type = gguf.RopeScalingType.LONGROPE
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+            elif rope_type == "dynamic":
+                # HunYuan, handled in model class
+                pass
+            elif rope_type.lower() == "llama3":
+                # Handled in generate_extra_tensors
+                pass
+            else:
+                logger.warning(f"Unknown RoPE type: {rope_type}")
+            logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
+
+        if (rope_theta := rope_params.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
            logger.info(f"gguf: rope theta = {rope_theta}")
        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
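As an illustration, a YaRN block in a model's config.json like the following (field names as read above, values hypothetical) is now picked up through self.rope_parameters and emitted as scaling type YARN together with the factor, original context length, and the optional extrapolation/attention/beta fields:

    "rope_parameters": {
        "rope_type": "yarn",
        "factor": 4.0,
        "original_max_position_embeddings": 32768,
        "beta_fast": 32,
        "beta_slow": 1
    }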
@@ -1146,6 +1204,9 @@ class TextModel(ModelBase):
        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
            res = "minimax-m2"
+        if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
+            # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
+            res = "kormo"

        if res is None:
            logger.warning("\n")
@@ -1475,6 +1536,21 @@ class TextModel(ModelBase):
            raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
        self.gguf_writer.add_pooling_type(pooling_type)

+    def _set_vocab_glmedge(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def _set_vocab_interns1(self):
        tokens: list[str] = []
        toktypes: list[int] = []
@@ -1604,7 +1680,7 @@ class MmprojModel(ModelBase):
    preprocessor_config: dict[str, Any]
    global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]

    has_vision_encoder: bool = True  # by default
    has_audio_encoder: bool = False
@@ -1680,7 +1756,8 @@ class MmprojModel(ModelBase):
        return self.global_config.get(config_name)

    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("audio_config")
+        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
+        return self.global_config.get(mm_config_key)

    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@@ -1955,34 +2032,10 @@ class BaichuanModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
-
+        super().set_gguf_parameters()
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        head_count = self.hparams["num_attention_heads"]
@@ -2078,34 +2131,10 @@ class XverseModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
-
+        super().set_gguf_parameters()
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@@ -2366,8 +2395,13 @@ class LlamaModel(TextModel):
        # fix for SmolVLM2, missing `num_attention_heads` in config.json
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]

    def set_vocab(self):
+        if self.origin_hf_arch == "GlmasrModel":
+            return self._set_vocab_glmedge()
+
        if self.is_mistral_format:
            return self._set_vocab_mistral()

@@ -2419,11 +2453,6 @@ class LlamaModel(TextModel):
        rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@@ -2443,6 +2472,7 @@ class LlamaModel(TextModel):
            "vision_language_adapter.",
            "patch_merger.",
            "pre_mm_projector_norm",
+            "audio_encoder.",
        ]

        is_multimodal_tensor = "vision_tower" in name \
@@ -2507,16 +2537,16 @@ class LlamaModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@@ -2553,11 +2583,6 @@ class ArceeModel(LlamaModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self._try_set_pooling_type()
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])


@ModelBase.register("AfmoeForCausalLM")
@@ -2840,27 +2865,18 @@ class Mistral3Model(LlamaModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        rope_params = self.hparams.get("rope_parameters")
+        rope_params = self.rope_parameters
        if self.hparams.get("model_type") == "ministral3":
-            assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
+            assert rope_params, "ministral3 must have 'rope_parameters' config"
            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
-        if name.endswith("weight_scale_inv"):
-            raise ValueError("This is a quantized weight, please use BF16 weight instead")
-
        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []

        return super().modify_tensors(data_torch, name, bid)

@@ -2950,7 +2966,7 @@ class DeciModel(TextModel):
            assert self.block_count == len(self._num_kv_heads)
            assert self.block_count == len(self._num_heads)
            assert self.block_count == len(self._ffn_dims)
-            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
                self.gguf_writer.add_rope_freq_base(rope_theta)
            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
            self.gguf_writer.add_head_count(self._num_heads)
@@ -2975,11 +2991,6 @@ class DeciModel(TextModel):
        rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@@ -3008,16 +3019,16 @@ class DeciModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@@ -3271,10 +3282,6 @@ class MiniCPMModel(TextModel):
        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
        self.gguf_writer.add_logit_scale(logit_scale)
        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
-            logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@ -3394,19 +3401,8 @@ class QwenModel(TextModel):
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_qwen()
|
self._set_vocab_qwen()
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
|
||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
|
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
|
||||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
|
|
||||||
class Qwen2Model(TextModel):
|
class Qwen2Model(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||||
|
|
||||||
|
|
@ -3419,11 +3415,6 @@ class Qwen2Model(TextModel):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self._try_set_pooling_type()
|
self._try_set_pooling_type()
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
if self.hf_arch == "Qwen2Model":
|
if self.hf_arch == "Qwen2Model":
|
||||||
|
|
@ -3491,12 +3482,6 @@ class DreamModel(TextModel):
|
||||||
|
|
||||||
# Dream models use non-causal attention for diffusion
|
# Dream models use non-causal attention for diffusion
|
||||||
self.gguf_writer.add_causal_attention(False)
|
self.gguf_writer.add_causal_attention(False)
|
||||||
# Handle RoPE scaling similar to Qwen2
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
# Add Dream-specific parameters
|
# Add Dream-specific parameters
|
||||||
mask_token_id = self.hparams.get("mask_token_id")
|
mask_token_id = self.hparams.get("mask_token_id")
|
||||||
|
|
@ -4040,13 +4025,6 @@ class Qwen2MoeModel(TextModel):
|
||||||
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
||||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
||||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||||
# YaRN is not enabled by default
|
|
||||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
|
@ -4648,7 +4626,7 @@ class Phi3MiniModel(TextModel):
|
||||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
||||||
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
sliding_window = self.hparams.get("sliding_window")
|
sliding_window = self.hparams.get("sliding_window")
|
||||||
# use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
|
# use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
|
||||||
|
|
@ -4924,7 +4902,7 @@ class Plamo2Model(TextModel):
|
||||||
self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
|
self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
||||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
|
self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))
|
||||||
|
|
||||||
# Mamba parameters
|
# Mamba parameters
|
||||||
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
||||||
|
|
@ -5122,21 +5100,6 @@ class InternLM2Model(TextModel):
|
||||||
|
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
|
||||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
num_heads = self.hparams["num_attention_heads"]
|
num_heads = self.hparams["num_attention_heads"]
|
||||||
num_kv_heads = self.hparams["num_key_value_heads"]
|
num_kv_heads = self.hparams["num_key_value_heads"]
|
||||||
|
|
@ -5213,11 +5176,6 @@ class InternLM3Model(TextModel):
|
||||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
n_head = self.hparams["num_attention_heads"]
|
n_head = self.hparams["num_attention_heads"]
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
@ -5580,7 +5538,6 @@ class NomicBertModel(BertModel):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
|
||||||
if self.is_moe:
|
if self.is_moe:
|
||||||
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
||||||
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||||
|
|
@ -5703,8 +5660,6 @@ class XLMRobertaModel(BertModel):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
# jina-embeddings-v3
|
# jina-embeddings-v3
|
||||||
if rotary_emb_base := self.hparams.get("rotary_emb_base"):
|
|
||||||
self.gguf_writer.add_rope_freq_base(rotary_emb_base)
|
|
||||||
lora_alpha = self.hparams.get("lora_alpha")
|
lora_alpha = self.hparams.get("lora_alpha")
|
||||||
if lora_prompt_prefixes := self.hparams.get("task_instructions"):
|
if lora_prompt_prefixes := self.hparams.get("task_instructions"):
|
||||||
assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
|
assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
|
||||||
|
|
@@ -5825,33 +5780,30 @@ class Gemma3Model(TextModel):
     norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value

     def set_vocab(self):
+        if (self.dir_model / "tokenizer.model").is_file():
             self._set_vocab_sentencepiece()

             self.gguf_writer.add_add_space_prefix(False)
+        else:
+            self._set_vocab_gpt2()

     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         hparams = self.hparams

         # some default values are not specified in the hparams
         self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
         self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
         self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
         # attn_logit_softcapping is removed in Gemma3
         assert hparams.get("attn_logit_softcapping") is None
+        if (final_logit_softcap := hparams.get("final_logit_softcapping")):
+            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+        if hparams.get("sliding_window_pattern") != 1:
             self.gguf_writer.add_sliding_window(hparams["sliding_window"])
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
-        if hparams.get("rope_scaling") is not None:
-            assert hparams["rope_scaling"]["rope_type"] == "linear"
-            # important: this rope_scaling is only applied for global layers, and not used by 1B model
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

@@ -5865,8 +5817,10 @@ class Gemma3Model(TextModel):

         # remove OOV (out-of-vocabulary) rows in token_embd
         if "embed_tokens.weight" in name:
-            vocab = self._create_vocab_sentencepiece()
-            tokens = vocab[0]
+            if (self.dir_model / "tokenizer.model").is_file():
+                tokens = self._create_vocab_sentencepiece()[0]
+            else:
+                tokens = self.get_vocab_base()[0]
             data_torch = data_torch[:len(tokens)]

         # ref code in Gemma3RMSNorm

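The OOV trim above cuts padded rows off `token_embd` so the tensor matches the real vocabulary. A toy illustration of the same slicing, with invented shapes (the sizes below are made up for the example):

```python
# Toy illustration of trimming out-of-vocabulary rows from an embedding
# tensor; the 262_400 -> 262_144 sizes are invented for this example.
import torch

vocab_size = 262_144                    # tokens actually present in the tokenizer
data_torch = torch.zeros(262_400, 8)    # checkpoint embedding padded past the vocab

tokens = ["<token>"] * vocab_size       # stand-in for the converter's token list
data_torch = data_torch[:len(tokens)]   # keep one row per real token
assert data_torch.shape[0] == vocab_size
```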
@@ -6753,13 +6707,6 @@ class Olmo2Model(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

         if "sliding_window" in self.hparams:
             self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

@@ -7258,12 +7205,11 @@ class DeepseekV2Model(TextModel):

         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)

     _experts: list[dict[str, Tensor]] | None = None

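The rewritten DeepseekV2 branch keeps only the YARN log-multiplier, derived from `mscale_all_dim`; the scaling type, factor, and original context length are now left to the shared rope handling. A quick check of the value being written (0.707 is a sample `mscale_all_dim` as seen in typical DeepSeek-V2 configs; treat it as an example input):

```python
# Example: the log-multiplier written for DeepSeek-V2-style YARN configs
# is 0.1 * mscale_all_dim; 0.707 is just a sample config value.
mscale_all_dim = 0.707
yarn_log_mul = 0.1 * mscale_all_dim
print(f"rope_scaling_yarn_log_mul = {yarn_log_mul:.4f}")  # 0.0707
```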
@@ -7871,11 +7817,6 @@ class Glm4Model(TextModel):
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."): # ignore visual part of Glm4v

@@ -8213,50 +8154,26 @@ class ExaoneModel(TextModel):
     model_arch = gguf.MODEL_ARCH.EXAONE

     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         hparams = self.hparams

         assert (hparams["activation_function"] == "silu")

-        max_position_embeddings = hparams["max_position_embeddings"]
-        embed_dim = hparams["hidden_size"]
-        num_heads = hparams["num_attention_heads"]
-        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
-        layer_norm_eps = hparams["layer_norm_epsilon"]
-        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
-        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
-        # attention_dropout_rate = hparams["attention_dropout"]
-        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
-        # embed_dropout_rate = hparams["embed_dropout"]
-        self.gguf_writer.add_embedding_length(embed_dim)
-        self.gguf_writer.add_head_count(num_heads)
-        self.gguf_writer.add_head_count_kv(num_kv_heads)
-        self.gguf_writer.add_context_length(max_position_embeddings)
-        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
-        self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_file_type(self.ftype)

-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
         rotary_factor = rotary_factor if rotary_factor is not None else 1.0
         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = self.rope_parameters.get("rope_theta", 10000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                 low_freq_wavelen = old_context_len / low_freq_factor

@@ -8311,22 +8228,17 @@ class Exaone4Model(TextModel):
         if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
             self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10_000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10_000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 16.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 16.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                 low_freq_wavelen = old_context_len / low_freq_factor

@@ -8637,13 +8549,6 @@ class BailingMoeModel(TextModel):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

@@ -8750,13 +8655,6 @@ class BailingMoeV2Model(TextModel):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

@@ -8835,13 +8733,6 @@ class GroveMoeModel(TextModel):
         self.gguf_writer.add_experts_per_group(2)
         # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
         self.gguf_writer.add_expert_group_scale(0.05)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

     _experts: list[dict[str, Tensor]] | None = None
     _chunk_experts: list[dict[str, Tensor]] | None = None

@@ -8984,6 +8875,63 @@ class UltravoxModel(TextModel):
         raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")

+
+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.startswith("model.") or name.startswith("lm_head."):
+            # skip language model tensors
+            return []
+
+        if name.startswith("audio_encoder.whisper."):
+            name = name.replace("audio_encoder.whisper.","audio_tower.")
+        if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
+            name = name.replace("audio_encoder.", "audio_encoder.adapting.")
+
+        if name.startswith("audio_encoder.audio_bos_eos_token."):
+            return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
+
+        if name.startswith("audio_encoder.adapting."):
+            name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
+            if ".layer_norm." in name:
+                name = name.replace(".layer_norm.", ".ln_pre.")
+            if ".0." in name:
+                name = name.replace(".0.", ".linear_1.")
+            if ".2." in name:
+                name = name.replace(".2.", ".linear_2.")
+            if ".proj." in name:
+                return []
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
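The new `GlmASRWhisperEncoderModel.modify_tensors` is mostly a chain of prefix rewrites. A standalone trace of how one adapter weight name travels through those replacements (the input name is a representative example, not taken from a real checkpoint):

```python
# Trace of the rename chain above on a sample adapter tensor name;
# "audio_encoder.adapting.0.weight" is an illustrative input.
name = "audio_encoder.adapting.0.weight"
name = name.replace("audio_encoder.adapting.", "audio.multi_modal_projector.")
name = name.replace(".0.", ".linear_1.")
print(name)  # audio.multi_modal_projector.linear_1.weight
```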

 @ModelBase.register("Qwen2AudioForConditionalGeneration")
 class WhisperEncoderModel(MmprojModel):
     has_vision_encoder = False # no vision encoder

@@ -9151,7 +9099,7 @@ class FalconH1Model(Mamba2Model):
         assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"

         # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])


 @ModelBase.register("HunYuanMoEV1ForCausalLM")

@@ -9229,12 +9177,11 @@ class HunYuanMoEModel(TextModel):
         self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])

         # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 1000)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 1000)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
             dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
             scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
             self.gguf_writer.add_rope_freq_base(scaled_base)

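The NTK-aware alpha scaling above can be sanity-checked directly; this reproduces the 11158839.9251 figure quoted in the HunYuan comment:

```python
# Verify the scaled RoPE base quoted in the HunYuan comment.
alpha, base, dim = 1000, 10000.0, 128
scaled_base = base * (alpha ** (dim / (dim - 2)))
print(f"{scaled_base:.4f}")  # 11158839.9251
```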
@@ -9429,12 +9376,11 @@ class HunYuanModel(TextModel):
         hparams = self.hparams

         # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 50)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 50)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
             dim = hparams["head_dim"]
             scaled_base = base * (alpha ** (dim / (dim - 2)))
             self.gguf_writer.add_rope_freq_base(scaled_base)

@@ -9585,13 +9531,6 @@ class GptOssModel(TextModel):
         self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))


 @ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -9764,13 +9703,6 @@ class SmallThinkerModel(TextModel):
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
         else:
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

         sliding_window_layout = self.hparams.get("sliding_window_layout")
         if sliding_window_layout:

@@ -9883,6 +9815,18 @@ class MistralModel(LlamaModel):
         self.gguf_writer.add_architecture()
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

+    def dequant_model(self):
+        # transform quantization config into HF format
+        quant_config = self.hparams.get("quantization")
+        if quant_config is not None:
+            assert quant_config["qformat_weight"] == "fp8_e4m3"
+            self.hparams["quantization_config"] = {
+                "activation_scheme": "static",
+                "quant_method": "fp8",
+                "weight_block_size": None,
+            }
+        return super().dequant_model()

     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
         assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg

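The `dequant_model` override added above just normalizes Mistral's native `quantization` section into the Hugging Face `quantization_config` shape before the generic dequant path runs. A minimal reproduction of that mapping (the input dict is a plausible example, not a real checkpoint config):

```python
# Minimal reproduction of the config normalization in dequant_model;
# the "quantization" input below is an illustrative example.
hparams = {"quantization": {"qformat_weight": "fp8_e4m3"}}

quant_config = hparams.get("quantization")
if quant_config is not None:
    assert quant_config["qformat_weight"] == "fp8_e4m3"
    hparams["quantization_config"] = {
        "activation_scheme": "static",
        "quant_method": "fp8",
        "weight_block_size": None,
    }

print(hparams["quantization_config"]["quant_method"])  # fp8
```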
@@ -10006,6 +9950,10 @@ class MistralMoeModel(DeepseekV2Model):
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
         yarn_params = self.hparams["yarn"]
         self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])

+        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+        # ref https://github.com/ggml-org/llama.cpp/pull/17945
         self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):

@@ -143,6 +143,7 @@ models = [
     {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
     {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
+    {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

@@ -9,7 +9,8 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
-- [main](/tools/main/)
+- [cli](/tools/cli/)
+- [completion](/tools/completion/)
 - [imatrix](/tools/imatrix/)
 - [quantize](/tools/quantize/)
 - [server](/tools/server/)

@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```
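Since the server examples now bind to port 8080, a quick way to check that the container is up is to POST a completion request; a minimal stdlib-only sketch (the host, prompt, and `n_predict` values are placeholders, and `/completion` is the llama.cpp server's native endpoint):

```python
# Minimal smoke test for a llama.cpp server container listening on 8080;
# the host, prompt, and n_predict values are illustrative.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps({"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 32}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["content"])
```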

 ## Docker With CUDA

@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA

@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

51
docs/ops.md
51
docs/ops.md
|
|
@ -16,14 +16,14 @@ Legend:
|
||||||
|-----------|------|------|------|------|------|------|------|------|------|------|------|
|
|-----------|------|------|------|------|------|------|------|------|------|------|------|
|
||||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
|
|
@ -31,20 +31,21 @@ Legend:
|
||||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
|
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
|
@ -63,9 +64,9 @@ Legend:
|
||||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
|
@ -74,7 +75,7 @@ Legend:
|
||||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
||||||
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
|
@ -83,7 +84,7 @@ Legend:
|
||||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
|
@@ -97,26 +98,26 @@ Legend:
 | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 | TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 | XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
docs/ops/CPU.csv
@@ -4964,6 +4964,7 @@
 "CPU","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CPU"
+"CPU","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CPU"
 "CPU","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CPU"
@@ -5419,17 +5420,45 @@
 "CPU","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
 "CPU","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
@@ -5655,6 +5684,7 @@
 "CPU","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CPU"
@@ -8644,9 +8674,13 @@
 "CPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CPU"
 "CPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
@@ -8666,9 +8700,13 @@
 "CPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CPU"
@@ -9411,18 +9449,405 @@
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CPU"
@@ -9435,6 +9860,10 @@
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
@@ -9463,15 +9892,30 @@
 "CPU","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CPU"
 "CPU","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CPU"
+"CPU","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CPU"
 "CPU","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CPU"
 "CPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CPU"
@@ -9480,6 +9924,10 @@
 "CPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CPU"
+"CPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CPU"
@@ -9487,10 +9935,16 @@
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","CPU"
Can't render this file because it is too large.
@@ -4964,6 +4964,7 @@
 "CUDA0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CUDA"
+"CUDA0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CUDA"
 "CUDA0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CUDA"
@@ -5419,17 +5420,45 @@
 "CUDA0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
 "CUDA0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
@@ -5655,6 +5684,7 @@
 "CUDA0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CUDA"
@@ -8644,9 +8674,13 @@
 "CUDA0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
 "CUDA0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CUDA"
 "CUDA0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
@@ -8666,9 +8700,13 @@
 "CUDA0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CUDA"
@@ -9411,18 +9449,405 @@
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","CUDA"
 "CUDA0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CUDA"
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CUDA"
@ -9435,6 +9860,10 @@
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
|
||||||
|
"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CUDA"
|
||||||
|
"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CUDA"
|
||||||
|
"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CUDA"
|
||||||
|
"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CUDA"
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
||||||
"CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
"CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
|
||||||
|
|
@@ -9463,34 +9892,59 @@
 "CUDA0","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CUDA"
 "CUDA0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CUDA"
+"CUDA0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CUDA"
 "CUDA0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CUDA"
 "CUDA0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CUDA"
 "CUDA0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","CUDA"
(Remainder of this file's diff not rendered: file too large.)
docs/ops/OpenCL.csv (19640 lines): diff suppressed (file too large).
docs/ops/SYCL.csv (1158 lines): diff suppressed (file too large).
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cstdio>
@@ -64,11 +65,12 @@ int main(int argc, char ** argv) {
     ctx_params.n_ctx   = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);

-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-
     auto sparams = llama_sampler_chain_default_params();
     sparams.no_perf = false;

+    std::vector<llama_sampler *> samplers;
+
+    for (int32_t i = 0; i < n_parallel; ++i) {
     llama_sampler * smpl = llama_sampler_chain_init(sparams);

     llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
@@ -76,6 +78,11 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
     llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

+        samplers.push_back(smpl);
+    }
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+
     if (ctx == NULL) {
         LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
@@ -173,7 +180,7 @@ int main(int argc, char ** argv) {
                 continue;
             }

-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);

             // is it an end of generation? -> mark the stream as finished
             if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
@@ -229,14 +236,17 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG("\n");
-    llama_perf_sampler_print(smpl);
+    llama_perf_sampler_print(samplers[0]);
     llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

     llama_batch_free(batch);

-    llama_sampler_free(smpl);
+    for (auto & sampler_config : samplers) {
+        llama_sampler_free(sampler_config);
+    }

     llama_free(ctx);
     llama_model_free(model);
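The hunks above replace the single shared sampler with one sampler chain per parallel sequence, so each stream keeps its own sampling state (notably the RNG inside the dist sampler). A minimal sketch of the resulting pattern, using only llama.h calls that appear in the diff; the helper name is hypothetical:

    // Hypothetical helper: build one sampler chain per parallel sequence.
    #include "llama.h"
    #include <vector>

    static std::vector<llama_sampler *> make_samplers(int32_t n_parallel, int32_t top_k, float temp, uint32_t seed) {
        auto sparams = llama_sampler_chain_default_params();
        sparams.no_perf = false;

        std::vector<llama_sampler *> samplers;
        samplers.reserve(n_parallel);
        for (int32_t i = 0; i < n_parallel; ++i) {
            llama_sampler * smpl = llama_sampler_chain_init(sparams);
            llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
            llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
            llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
            samplers.push_back(smpl);
        }
        return samplers;
    }

Sampling for stream i then uses samplers[i], and every chain must be released individually with llama_sampler_free(), as the last hunk does.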
@@ -131,10 +131,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -202,10 +202,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
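These hunks, repeated across the examples below, switch from the old common_init_result struct with smart-pointer members and .get() calls to an init object exposing model() and context() accessors. A sketch of the updated idiom, hedged since the new return type of common_init_from_params is not itself shown in this diff:

    // Assumed shape of the new init handle (accessor names taken from the hunks).
    auto llama_init = common_init_from_params(params); // owns the model and context

    auto * model = llama_init->model();   // non-owning raw pointer
    auto * ctx   = llama_init->context(); // non-owning raw pointer

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s: failed to init\n", __func__);
        return 1;
    }
    // model/ctx stay valid for the lifetime of llama_init; no .get() calls and
    // no manual free, matching the call sites updated throughout this diff.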
@@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
-    for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
+    auto all_args = opt.get_args();
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
             file << arg;
-            if (opt.args.size() > 1) file << ", ";
+            if (all_args.size() > 1) file << ", ";
         } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
+            file << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     // value hint
@@ -76,7 +77,7 @@ static void export_md(std::string fname, llama_example ex) {
 }

 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
     export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);

     return 0;
@@ -55,10 +55,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the target model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     auto * mem = llama_get_memory(ctx);
@@ -18,16 +18,16 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     GGML_ASSERT(model != nullptr);

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);

     common_ngram_cache ngram_cache;
@@ -28,13 +28,13 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_context_ptr & ctx = llama_init.context;
+    llama_context * ctx = llama_init->context();

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);

     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;
@@ -65,7 +65,7 @@ int main(int argc, char ** argv){
     }

     const int n_input = inp.size();
-    const int n_ctx = llama_n_ctx(ctx.get());
+    const int n_ctx = llama_n_ctx(ctx);

     int n_drafted = 0;
     int n_accept  = 0;
@@ -29,10 +29,10 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -1,10 +1,13 @@
 #!/usr/bin/env python3

-import numpy as np
 import sys
-import os
+import numpy as np
 from pathlib import Path

+# Add utils directory to path for direct script execution
+sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+
 def quick_logits_check(pytorch_file, llamacpp_file):
     """Lightweight sanity check before NMSE"""
@@ -32,27 +35,16 @@ def quick_logits_check(pytorch_file, llamacpp_file):
     print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
     print(f"Max absolute difference: {max_diff:.4f}")

-    if max_diff > 1.0:
-        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
-        return False
-
     return True

 def main():
-    model_path = os.getenv('MODEL_PATH')
-    if not model_path:
-        print("Error: MODEL_PATH environment variable not set")
-        sys.exit(1)
-
-    if not os.path.exists(model_path):
-        print(f"Error: Model file not found: {model_path}")
-        sys.exit(1)
-
-    model_name = os.path.basename(model_path)
+    model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    print(f"Using converted model: {llamacpp_model_name}")
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

     if not pytorch_file.exists():
         print(f"Error: PyTorch logits file not found: {pytorch_file}")
@@ -200,7 +200,7 @@ with torch.no_grad():
     logits = outputs.logits

     # Extract logits for the last token (next token prediction)
-    last_logits = logits[0, -1, :].cpu().numpy()
+    last_logits = logits[0, -1, :].float().cpu().numpy()

     print(f"Logits shape: {logits.shape}")
     print(f"Last token logits shape: {last_logits.shape}")
@@ -5,6 +5,7 @@ import sys
 import os
 import argparse
 from pathlib import Path
+from common import get_model_name_from_env_path  # type: ignore[import-not-found]

 def calculate_nmse(reference, test):
     mse = np.mean((test - reference) ** 2)
@@ -67,11 +68,13 @@ def main():
     parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
     args = parser.parse_args()

-    model_name = os.path.basename(args.model_path)
+    model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

     print(f"Model name: {model_name}")
     print(f"PyTorch logits file: {pytorch_file}")
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+def get_model_name_from_env_path(env_path_name):
+    model_path = os.getenv(env_path_name)
+    if not model_path:
+        print(f"Error: {env_path_name} environment variable not set")
+        sys.exit(1)
+
+    if not os.path.exists(model_path):
+        print(f"Error: Model file not found: {model_path}")
+        sys.exit(1)
+
+    name = os.path.basename(os.path.normpath(model_path))
+    if name.endswith(".gguf"):
+        name = name[:-5]
+
+    return name
@@ -192,10 +192,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the target model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     auto * mem = llama_get_memory(ctx);
@@ -149,10 +149,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -34,10 +34,10 @@ int main(int argc, char ** argv) {
     std::string result2;

     // init
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
@@ -40,10 +40,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;

     // load the target model
-    common_init_result llama_init_tgt = common_init_from_params(params);
+    auto llama_init_tgt = common_init_from_params(params);

-    model_tgt = llama_init_tgt.model.get();
-    ctx_tgt   = llama_init_tgt.context.get();
+    model_tgt = llama_init_tgt->model();
+    ctx_tgt   = llama_init_tgt->context();

     const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
@@ -61,10 +61,10 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;

-    common_init_result llama_init_dft = common_init_from_params(params);
+    auto llama_init_dft = common_init_from_params(params);

-    //model_dft = llama_init_dft.model.get();
-    ctx_dft = llama_init_dft.context.get();
+    //model_dft = llama_init_dft->model();
+    ctx_dft = llama_init_dft->context();

     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
         LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
@@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
     LOG_INF("target:\n\n");
     common_perf_print(ctx_tgt, smpl);

+    llama_batch_free(batch_tgt);
+
     common_sampler_free(smpl);
     common_speculative_free(spec);
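The added llama_batch_free(batch_tgt) call closes a leak: batches created with llama_batch_init own heap memory and must be released explicitly. A minimal sketch of the lifecycle, using only llama.h calls:

    // llama_batch lifecycle: init once, reuse across decode calls, free once.
    llama_batch batch = llama_batch_init(/*n_tokens=*/512, /*embd=*/0, /*n_seq_max=*/1);

    // ... fill the batch and call llama_decode(ctx, batch) in a loop ...

    llama_batch_free(batch); // required for every batch from llama_batch_init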
@@ -71,10 +71,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;

     // load the target model
-    common_init_result llama_init_tgt = common_init_from_params(params);
+    auto llama_init_tgt = common_init_from_params(params);

-    model_tgt = llama_init_tgt.model.get();
-    ctx_tgt   = llama_init_tgt.context.get();
+    model_tgt = llama_init_tgt->model();
+    ctx_tgt   = llama_init_tgt->context();

     // load the draft model
     params.devices = params.speculative.devices;
@@ -87,10 +87,10 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;

-    common_init_result llama_init_dft = common_init_from_params(params);
+    auto llama_init_dft = common_init_from_params(params);

-    model_dft = llama_init_dft.model.get();
-    ctx_dft   = llama_init_dft.context.get();
+    model_dft = llama_init_dft->model();
+    ctx_dft   = llama_init_dft->context();

     const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
     const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
@@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
             bool accept = false;
             if (params.sampling.temp > 0) {
                 // stochastic verification
-                common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
+                common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

                 auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
@@ -491,7 +491,7 @@ int main(int argc, char ** argv) {
                 continue;
             }

-            common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+            common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);

             const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
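Both call sites drop the trailing boolean previously passed as true to common_sampler_sample. A sketch of the updated call shape taken directly from the hunks above (the semantics of the removed flag are not documented in this diff):

    // Updated call shape: three arguments, followed by a candidates query.
    const llama_token id = common_sampler_sample(smpl, ctx, i_batch);
    const auto * cur_p   = common_sampler_get_candidates(smpl, true);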
@@ -39,9 +39,10 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
     // load the model and apply lora adapter, if any
-    common_init_result llama_init = common_init_from_params(params);
-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
+    auto llama_init = common_init_from_params(params);
+
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -54,8 +55,8 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

-    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
-    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx, tokens, llama_n_ctx(ctx) / 2);

     struct lr_opt & lr = params.lr;
     LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
@@ -70,7 +71,7 @@ int main(int argc, char ** argv) {
         /*get_opt_pars_ud =*/ &params.lr,
         /*optimizer_type  =*/ params.optimizer,
     };
-    llama_opt_init(ctx.get(), model.get(), lopt_params);
+    llama_opt_init(ctx, model, lopt_params);

     const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
@@ -78,7 +79,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_t result_eval = ggml_opt_result_init();

     for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
-        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+        llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
             ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
         fprintf(stderr, "\n");
@@ -88,7 +89,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_free(result_train);
     ggml_opt_result_free(result_eval);

-    llama_model_save_to_file(model.get(), params.out_file.c_str());
+    llama_model_save_to_file(model, params.out_file.c_str());

     llama_backend_free();
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()

 if (EMSCRIPTEN)
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
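The two size-query functions added above allow computing how much memory a graph reservation or a context's tensors would take without actually allocating anything. A sketch under those declarations; NULL buffer ids are valid, as the ggml_gallocr_reserve wrapper elsewhere in this diff shows:

    // Per-buffer sizes that ggml_gallocr_reserve_n would allocate for `graph`.
    // `sizes` needs one entry per buffer type the galloc was created with
    // (one here, assuming a single-buffer galloc).
    size_t sizes[1] = { 0 };
    ggml_gallocr_reserve_n_size(galloc, graph, /*node_buffer_ids=*/NULL, /*leaf_buffer_ids=*/NULL, sizes);

    // Total buffer size ggml_backend_alloc_ctx_tensors_from_buft would allocate
    // for the tensors in `ctx` (the implementation requires no_alloc contexts):
    size_t nbytes = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);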
@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
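The scheduler gets the same treatment: the new reserve_size variant reports measured buffer sizes instead of allocating them. A sketch using only the declarations visible in this hunk (needs <vector>):

    // One size entry per backend in the sched.
    std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched));
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    // sizes[i] now holds the buffer size the i-th backend would need.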
@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme        (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
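The new query pairs naturally with the existing RISC-V feature flag; a short sketch (needs <cstdio>):

    // Report the RISC-V vector capability, per the declaration above.
    if (ggml_cpu_has_riscv_v()) {
        printf("RVV enabled, vector length = %d bytes\n", ggml_cpu_get_rvv_vlen());
    }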
@@ -2305,12 +2305,10 @@ extern "C" {
             float stop,
             float step);

-#define GGML_KQ_MASK_PAD 1
-
     // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
     // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
     // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // mask: [n_kv,     n_batch,     ne32,      ne33]
     // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
     //
     // broadcast:
@@ -2617,6 +2615,7 @@ extern "C" {

     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
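ggml_log_get makes the active callback retrievable, which enables a save-and-restore pattern around noisy sections. A sketch using only the two declarations above; quiet_logger is a hypothetical callback of type ggml_log_callback:

    ggml_log_callback prev_cb = NULL;
    void * prev_ud = NULL;

    ggml_log_get(&prev_cb, &prev_ud); // save the active callback and user data
    ggml_log_set(quiet_logger, NULL); // quiet_logger: hypothetical, drops all messages
    // ... operation whose log output should be suppressed ...
    ggml_log_set(prev_cb, prev_ud);   // restore the previous logger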
@@ -25,6 +25,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
+        case GGML_OP_FILL:
         case GGML_OP_SCALE:
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
@@ -311,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
     size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, addr, tensor);
-#endif
-
     struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

     // see if we can merge with an existing block
@@ -356,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
     }
     // otherwise, add a new block
     ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-
-    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -602,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }

 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }

 // free the extra space at the end if the new tensor is smaller
@@ -615,13 +609,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
     GGML_ASSERT(parent_size >= node_size);

-    if (parent_size > node_size) {
+    // note: we want after the freeing the chunks to continue to be aligned
     struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+    node_size   = aligned_offset(NULL, node_size, p_alloc->alignment);
+
+    if (parent_size > node_size) {
         struct buffer_address p_addr = p_hn->addr;
         p_addr.offset += node_size;
         size_t extra_size = parent_size - node_size;
         AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
     }
 }

@@ -705,7 +703,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+
+    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
     hn->allocated = false;
 }
@@ -820,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
     }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -925,12 +931,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
             if (cur_size > 0) {
                 GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                    __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                    cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                    __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
             }
         }
 #endif
         ggml_vbuffer_free(galloc->buffers[i]);
+        if (no_alloc) {
+            galloc->buffers[i] = NULL;
+        } else {
         galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         if (galloc->buffers[i] == NULL) {
             GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
@@ -938,10 +946,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    }

     return true;
 }

+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1144,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

     size_t alignment = ggml_backend_buft_get_alignment(buft);
@@ -1152,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;

     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1163,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;
@@ -1175,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }

+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }
@ -1193,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
||||||
} else {
|
} else {
|
||||||
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
||||||
}
|
}
|
||||||
free(buffers);
|
if (buffers) {
|
||||||
|
free(buffers); // can be NULL if context is empty or no_alloc
|
||||||
|
}
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||||
|
size_t nbytes_total = 0;
|
||||||
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
|
||||||
|
GGML_ASSERT(!buf);
|
||||||
|
return nbytes_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||||
|
size_t nbytes_total = 0;
|
||||||
|
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
|
||||||
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
|
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
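With the `_impl` split above, `ggml_backend_alloc_ctx_tensors_from_buft_size` runs the same buffer-chunking walk in no-alloc mode and reports the total byte count instead of returning a buffer. A hedged sketch of the intended call pattern (`ctx`, `buft`, and `budget_bytes` are assumed caller-side values, not names from this commit):

    size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    if (need <= budget_bytes) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        GGML_ASSERT(buf != NULL);
    } else {
        // fall back to a smaller context or a different buffer type
    }
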
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }

-    GGML_ASSERT(buft);
-
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }

+    // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+    // I don't know whether the above comment is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);

     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

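`ggml_backend_sched_reserve_size` mirrors `ggml_backend_sched_reserve` but only measures. A sketch of how a caller might use it (assumptions: `sched` and `measure_graph` already exist, and the GGML_SCHED_MAX_BACKENDS constant bounds the sizes array):

    size_t sizes[GGML_SCHED_MAX_BACKENDS] = { 0 };
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes);
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) {
        fprintf(stderr, "backend %d would need %zu bytes\n", i, sizes[i]);
    }
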
@@ -2251,12 +2251,12 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
                                   int sections[4],
                                   bool mrope_used,
                                   bool is_imrope,
-                                  bool indep_sects) {
-    ggml_tensor * src0 = dst->src[0];  // input
+                                  bool indep_sects,
+                                  int64_t rope_dims) {
     ggml_tensor * src1 = dst->src[1];  // position
     ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = src0->ne[0] / 2;
+    int64_t theta_scale_length = rope_dims / 2;
     int64_t position_length = dst->ne[2];

     // TODO: check theta_scale_length and position_length.

@@ -2331,18 +2331,17 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
             ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
                                        ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
                                        ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
+        }
         acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
                                                          theta_scale_ne, theta_scale_nb, 1);
-    }

     // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
+    // TODO: acl_yarn_ramp_tensor use rope cache.
     bool yarn_ramp_tensor_updated = false;
     ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
     acl_tensor_ptr acl_yarn_ramp_tensor;
-    if (ext_factor != 0 &&
-        // TODO: check more parameter.
-        (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) {
+    if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
+                            ctx.rope_cache.freq_scale != freq_scale)) {
         yarn_ramp_tensor_updated = true;

         // -rope_yarn_ramp

@@ -2590,7 +2589,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
         aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
     }

-    int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 };
+    int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
     size_t sin_reshape_nb[GGML_MAX_DIMS];
     sin_reshape_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {

@@ -2660,11 +2659,10 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
+    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);

-    // TODO: n_dims <= ne0
-    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
+    GGML_ASSERT(n_dims <= ne00);

     const float theta_scale = powf(freq_base, -2.0f / n_dims);

@@ -2673,7 +2671,10 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

     bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl apply interleaved mrope
-    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
+    // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
+    // Note: this bit is also set for imrope and some vision modes,
+    // so mrope_used does NOT exclusively indicate pure mrope.
+    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

     if (mrope_used) {

@@ -2681,17 +2682,31 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     }

     if (is_vision) {
-        GGML_ASSERT(n_dims == ne0/2);
+        GGML_ASSERT(n_dims == ne0 / 2);
     }

     if (is_imrope || mrope_used) {
         is_neox = true;
     }

-    // init ctx.rope_cos/rope_sin cache
-    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision);
+    int64_t rope_dims = n_dims;

-    int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
+    //Our current RotaryPositionEmbedding does not support the VISION mode,
+    //but essentially it only modifies theta_base in mrope,
+    //then repeats it at the end in the same way as is_neox.
+    //In fact, RoPE is still applied across all dimensions.
+    if (is_vision) {
+        rope_dims = src0->ne[0];
+    }
+    int64_t tail_dims = ne00 - rope_dims;
+    bool has_tail = tail_dims > 0;
+
+    // init ctx.rope_cos/rope_sin cache
+    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
+                          mrope_used, is_imrope, is_vision, rope_dims);
+
+    // Cache is generated with ne00 dimensions, so we use ne00 for reshape
+    int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
     size_t sin_reshape_nb[GGML_MAX_DIMS];
     sin_reshape_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {

@@ -2704,7 +2719,6 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

     acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
     acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
-

 #ifdef ASCEND_310P
     // Special ROPE operation for 310P

@@ -2844,46 +2858,124 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     }
     return;
 #endif

     int64_t acl_mode = is_neox ? 0 : 1;

-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
+    // Pre-define head and tail dimensions for reuse
+    int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
+    int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };
+
+    // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
+    bool src_dst_need_trans = false;
+    ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
+    ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
+    acl_tensor_ptr acl_src_trans_tensor;
+    acl_tensor_ptr acl_dst_trans_tensor;
+    void * src_trans_buffer = nullptr;
+    void * dst_trans_buffer = nullptr;
+    size_t src_dst_trans_nb[GGML_MAX_DIMS];
+    if (src0->type == GGML_TYPE_F16) {
+        src_dst_need_trans = true;
+        src_trans_buffer = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
+        dst_trans_buffer = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));
+
+        src_dst_trans_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
+        }
+        acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
+                                                       src_dst_trans_nb, GGML_MAX_DIMS);
+        acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
+                                                       src_dst_trans_nb, GGML_MAX_DIMS);
+        aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
+    }
+
+    // Step 2: Prepare head tensors for tail splitting if needed
+    acl_tensor_ptr acl_src_head;
+    acl_tensor_ptr acl_dst_head;
+    if (has_tail) {
+        // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
+        // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
+        if (src_dst_need_trans) {
+            // Use F32 trans tensor strides
+            acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
+                                                   src_dst_trans_nb, GGML_MAX_DIMS);
+        } else {
+            // Use original F32 tensor strides
+            acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
+                                                   GGML_MAX_DIMS);
+        }
+
+        int64_t head_elements = rope_dims * ne01 * ne02 * ne03;
+        ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float));
+        void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get();
+
+        size_t head_contiguous_nb[GGML_MAX_DIMS];
+        head_contiguous_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
+        }
+        acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
+                                               head_contiguous_nb, GGML_MAX_DIMS);
+    }
+
+    // Step 3: Execute RotaryPositionEmbedding
+    if (has_tail) {
+        // Rotate only the head portion (first rope_dims dimensions)
+        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
+                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
+
+        // Copy head result from contiguous buffer back to destination tensor
+        if (src_dst_need_trans) {
+            acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
+                (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+        } else {
+            acl_tensor_ptr acl_dst_head_target =
+                ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+        }
+    } else if (src_dst_need_trans) {
+        // Rotate full tensor (no tail), using trans tensors
+        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
+                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
+    } else {
+        // Rotate full tensor (no tail), using original tensors
         GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
                                 acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
-                break;
-            }
-        case GGML_TYPE_F16:
-            {
-                ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
-                void * src_trans_buffer = src_trans_allocator.get();
-                ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
-                void * dst_trans_buffer = dst_trans_allocator.get();
-
-                size_t src_trans_nb[GGML_MAX_DIMS];
-                src_trans_nb[0] = sizeof(float);
-                for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
-                }
-
-                acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
-                    src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
-                acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
-                    dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);
-
-                aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
-
-                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
-                                        acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
-                                        acl_dst_trans_tensor.get());
-
+    }
+
+    // Step 4: Copy unrotated tail portion from source to destination
+    if (has_tail) {
+        size_t src_tail_offset;
+        size_t dst_tail_offset;
+
+        auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
+                                    size_t * nb_src_arr, size_t * nb_dst_arr) {
+            acl_tensor_ptr acl_src_tail =
+                ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
+            acl_tensor_ptr acl_dst_tail =
+                ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
+        };
+
+        if (src_dst_need_trans) {
+            // Use F32 trans tensor strides and offsets
+            src_tail_offset = rope_dims * src_dst_trans_nb[0];
+            dst_tail_offset = rope_dims * src_dst_trans_nb[0];
+            copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
+                             ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
+        } else {
+            // Use original tensor strides and offsets
+            src_tail_offset = rope_dims * nb00;
+            dst_tail_offset = rope_dims * nb0;
+            copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
+                             ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
+        }
+    }
+
+    // Step 5: Cast back to F16 if needed
+    if (src_dst_need_trans) {
         aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
-                break;
-            }
-        default:
-            GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
-            break;
     }
 }

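The net effect of Steps 1-5 above is a partial rotation: only the first rope_dims values of each row go through RotaryPositionEmbedding, and the remaining tail is copied through unchanged. A scalar reference of the same split, for illustration only (simplified neox-style pairing, not the CANN code path):

    // Rotate the first rope_dims of one row; pass the tail through unchanged.
    void rope_row_ref(const float * src, float * dst, int64_t ne0, int64_t rope_dims,
                      const float * cosv, const float * sinv) {
        const int64_t half = rope_dims / 2;
        for (int64_t i = 0; i < half; i++) {  // pair (i, i + half), neox layout
            const float x0 = src[i];
            const float x1 = src[i + half];
            dst[i]        = x0 * cosv[i] - x1 * sinv[i];
            dst[i + half] = x0 * sinv[i] + x1 * cosv[i];
        }
        for (int64_t i = rope_dims; i < ne0; i++) {  // unrotated tail
            dst[i] = src[i];
        }
    }
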
@@ -315,7 +315,7 @@ struct ggml_cann_rope_cache {
         if (theta_scale_exp_host) {
             free(theta_scale_exp_host);
         }
-        if(position_select_index_host) {
+        if (position_select_index_host) {
             free(position_select_index_host);
         }
     }

@@ -2474,16 +2474,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
             }
         case GGML_OP_ROPE:
             {
-                // TODO: with ops-test v == 1
-                // TODO: n_dims <= ne0
-                if (op->src[0]->ne[0] != op->op_params[1]) {
-                    return false;
-                }
-
                 if (op->src[0]->ne[0] > 896) {
                     return false;
                 }
 #ifdef ASCEND_310P
+                // TODO: Support rope_dim < ne00(dim)
+                if (op->src[0]->ne[0] != op->op_params[1]) {
+                    return false;
+                }
                 if (!ggml_is_contiguous(op->src[0])) {
                     return false;
                 }

@@ -2550,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
+            return true;
         case GGML_OP_PAD:
             // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
             return ggml_get_op_params_i32(op, 8) == 0;

@@ -24,6 +24,7 @@

 #define UNUSED GGML_UNUSED

+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t *     out_mins,
                                              int8_t *        out_scales) {

@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif

 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);

||||||
|
|
@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
|
||||||
} ggml_arm_arch_features = { 0 };
|
} ggml_arm_arch_features = { 0 };
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__riscv)
|
||||||
|
struct ggml_riscv_arch_features_type {
|
||||||
|
int rvv_vlen;
|
||||||
|
} ggml_riscv_arch_features = { 0 };
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
|
|
||||||
|
|
@ -187,6 +192,9 @@ typedef void * thread_ret_t;
|
||||||
|
|
||||||
typedef pthread_t ggml_thread_t;
|
typedef pthread_t ggml_thread_t;
|
||||||
|
|
||||||
|
#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
|
||||||
|
#define GGML_THREADPOOL_N_THREADS_BITS (16)
|
||||||
|
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <mach/mach.h>
|
#include <mach/mach.h>
|
||||||
|
|
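These two constants define the packing that replaces the old separate thread counter: the low 16 bits of n_graph carry the active thread count and the high bits carry a graph generation counter, so a single atomic load yields both values. A small illustration of the encoding:

    // Pack/unpack round trip (illustrative values, not code from this commit):
    int n_graph   = (3 << GGML_THREADPOOL_N_THREADS_BITS) | (8 & GGML_THREADPOOL_N_THREADS_MASK);
    int n_threads = n_graph &  GGML_THREADPOOL_N_THREADS_MASK;   // -> 8 active threads
    int gen       = n_graph >> GGML_THREADPOOL_N_THREADS_BITS;   // -> generation 3
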
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;

     // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph;       // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

@@ -460,9 +468,7 @@ struct ggml_threadpool {
     atomic_int abort; // Used for aborting processing of a graph

     struct ggml_compute_state * workers; // per thread state
-    int n_threads_max; // number of threads in the pool
-    atomic_int n_threads_cur; // number of threads used in the current graph
+    int n_threads; // Number of threads in the pool

     int32_t prio;  // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)

@@ -539,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};

 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }

@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }

@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;

-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;

 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;

@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }

 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)

@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };

+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }

+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);

     return 0;

@@ -2946,27 +2965,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 #ifndef GGML_USE_OPENMP

-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

     if (state->pending || threadpool->stop || threadpool->pause) { return true; }

     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending    = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }

-    return state->pending;
+    return false;
 }

 // sync thread state after polling

@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;

@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }

@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

     ggml_mutex_lock(&threadpool->mutex);

-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings

@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause           = tpp->paused;
     threadpool->abort           = -1;
     threadpool->workers         = NULL;
-    threadpool->n_threads_max   = tpp->n_threads;
-    threadpool->n_threads_cur   = tpp->n_threads;
+    threadpool->n_threads       = tpp->n_threads;
     threadpool->poll            = tpp->poll;
     threadpool->prio            = tpp->prio;
     threadpool->ec              = GGML_STATUS_SUCCESS;

@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
         }

         // Apply thread CPU mask and priority

@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }

     // Kick all threads to start the new graph

@@ -3464,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;

@@ -3630,6 +3647,10 @@ void ggml_cpu_init(void) {
     ggml_init_arm_arch_features();
 #endif

+#if defined(__riscv)
+    ggml_init_riscv_arch_features();
+#endif
+
     is_first_call = false;
 }

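A note on units: assuming the usual semantics of __riscv_vlenb(), the stored value is the vector register width in bytes (VLEN/8), so the repack gate below that requires ggml_cpu_get_rvv_vlen() >= QK4_0 (32) effectively demands 256-bit vector registers. Illustration:

    int vlen_bytes = ggml_cpu_get_rvv_vlen();  // 0 when RVV intrinsics are unavailable
    int vlen_bits  = vlen_bytes * 8;           // e.g. 32 bytes -> 256 bits
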
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }

@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }

@@ -67,13 +67,16 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

 #define GGML_CUDA_CC_IS_AMD(cc)     (cc >= GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_RDNA(cc)    (cc >= GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_RDNA1(cc)   (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
 #define GGML_CUDA_CC_IS_RDNA2(cc)   (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc)   (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
+#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
 #define GGML_CUDA_CC_IS_RDNA4(cc)   (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)     (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
 #define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)

@@ -0,0 +1,77 @@
+#include "convert.cuh"
+#include "diag.cuh"
+#include "ggml.h"
+
+template <typename T>
+static __global__ void diag_kernel(T * __restrict__ dst,
+                                   const T * __restrict__ src,
+                                   const int64_t ne0,
+                                   const int64_t ne1,
+                                   const int64_t ne2,
+                                   const int64_t ne3,
+                                   const int64_t total_elements) {
+    const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int64_t i0 = global_idx % ne0;
+    const int64_t i1 = (global_idx / ne0) % ne1;
+    const int64_t i2 = (global_idx / (ne0 * ne1)) % ne2;
+    const int64_t i3 = global_idx / (ne0 * ne1 * ne2);
+
+    const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;
+
+    if (i0 == i1) {
+        const int64_t batch_idx = i3 * ne2 + i2;
+        const int64_t src_idx   = batch_idx * ne0 + i0;
+        dst[dst_idx] = src[src_idx];
+    } else {
+        dst[dst_idx] = ggml_cuda_cast<T>(0);
+    }
+    GGML_UNUSED_VARS(ne3);
+}
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    void *       dst_d  = dst->data;
+    const void * src0_d = src0->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    GGML_ASSERT(ne00 == ne0);
+    GGML_ASSERT(ne01 == 1);
+    GGML_ASSERT(ne02 == ne2);
+    GGML_ASSERT(ne03 == ne3);
+
+    const int64_t n_elems    = ggml_nelements(dst);
+    const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
+            break;
+        case GGML_TYPE_F16:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
+            break;
+        default:
+            GGML_ABORT("unsupported type");
+    }
+}

@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_DIAG_BLOCK_SIZE 256
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

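For reference, the semantics the new kernel implements: src has shape [ne0, 1, ne2, ne3] and each length-ne0 row is expanded into an ne0 x ne0 matrix with that row on the diagonal and zeros elsewhere. A host-side sketch of the same computation (illustration only, not the CUDA path):

    // batches = ne2 * ne3
    void diag_ref(const float * src, float * dst, int64_t ne0, int64_t batches) {
        for (int64_t b = 0; b < batches; b++) {
            for (int64_t i1 = 0; i1 < ne0; i1++) {
                for (int64_t i0 = 0; i0 < ne0; i0++) {
                    dst[(b * ne0 + i1) * ne0 + i0] = (i0 == i1) ? src[b * ne0 + i0] : 0.0f;
                }
            }
        }
    }
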
@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
     const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
     const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     const bool did_not_have_any_data   = kbc0 == kbc0_stop;
     const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;

@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
         int bidx = bidx0 - 1;
         int kbc_stop = kbc0;
         while(true) {
-            const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+            const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
             if (kbc == kbc_stop) { // Did not have any data.
                 bidx--;
                 kbc_stop = kbc;

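The int64_t casts matter because the index is a product of four loop extents taken before a division; with large contexts the 32-bit product can wrap even though the final quotient fits in an int. Illustrative numbers:

    // 32-bit: 1024 * 4096 * 8 * 32 * 4 == 2^32, which wraps to 0 before the division.
    // Promoting the first operand makes the whole product 64-bit:
    int bidx0 = 1023, iter_k = 4096, iter_j = 8, heads = 32, ne03 = 4, grid = 1024;
    long long kbc0_stop = (long long)(bidx0 + 1) * iter_k * iter_j * heads * ne03 / grid;
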
@@ -955,9 +955,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             (K_h2 + int64_t(kb0)*nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K, k_VKQ_sup);
     }

+    // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
+    if constexpr (ncols2 == 1) {
+        constexpr bool oob_check = true;
         for (; kb0 < kb0_stop-1; ++kb0) {
             constexpr bool last_iter = false;
-            constexpr bool oob_check = false;
             constexpr int k_VKQ_sup = nbatch_fa;
             flash_attn_ext_f16_iter
                 <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,

@@ -966,21 +968,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
                  ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
                  KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
         }
-    // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
-    if constexpr (ncols2 == 1) {
-        if (ne11 % nbatch_fa == 0) {
             constexpr bool last_iter = true;
-            constexpr bool oob_check = false;
-            constexpr int k_VKQ_sup = nbatch_fa;
-            flash_attn_ext_f16_iter
-                <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
-                 T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
-                (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
-                 ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
-                 KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
-        } else {
-            constexpr bool last_iter = true;
-            constexpr bool oob_check = true;
             const int k_VKQ_sup = ne11 - kb0*nbatch_fa;
             flash_attn_ext_f16_iter
                 <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,

@@ -988,10 +976,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
                 (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
                  ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
                  KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
-        }
     } else {
-        constexpr bool last_iter = true;
         constexpr bool oob_check = false;
+        for (; kb0 < kb0_stop-1; ++kb0) {
+            constexpr bool last_iter = false;
+            constexpr int k_VKQ_sup = nbatch_fa;
+            flash_attn_ext_f16_iter
+                <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
+                 T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
+                (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
+                 ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
+                 KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
+        }
+        constexpr bool last_iter = true;
         constexpr int k_VKQ_sup = nbatch_fa;
         flash_attn_ext_f16_iter
             <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,

@@ -1383,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
     const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;

     // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
     // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).

@@ -564,6 +564,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
         for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
             const int i_KQ = i_KQ_0 + (threadIdx.y % np)*warp_size + threadIdx.x;

+#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+            // Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
+            // Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
+            KQ_acc[i_KQ_0/(np*warp_size)*cpw + jc0] *= 4.0f;
+#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+
             if (use_logit_softcap) {
                 KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] = logit_softcap * tanhf(KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
             }

@@ -858,6 +864,11 @@ static __global__ void flash_attn_tile(
 #pragma unroll
             for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
                 tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
+#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+                // Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
+                // Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
+                tmp_h2[i1/2] *= make_half2(0.25f, 0.25f);
+#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
             }
             ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
                 &Q_tmp[jc*(DKQ/2) + i0/2 + (threadIdx.y % np)*(warp_size*cpy_ne_D/2) + threadIdx.x*(cpy_ne_D/2)],

||||||
|
|
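The two hunks above implement a single trick: Q values are scaled by 0.25 when staged, and the FP32 KQ accumulator is multiplied by 4.0 afterwards, so the dot product is unchanged while the FP16 partial products stay four times further from the FP16 maximum of about 65504. A small sketch of the identity in plain C++ (FP32 stand-in, invented values; the kernel does the inner products in FP16, which is where the headroom matters):

    #include <cstdio>

    int main() {
        const float q[4] = {120.0f, -80.0f, 250.0f, 60.0f};
        const float k[4] = {200.0f, 150.0f, 180.0f, -90.0f};

        float direct = 0.0f, scaled = 0.0f;
        for (int i = 0; i < 4; ++i) {
            direct += q[i] * k[i];             // partial products near the FP16 limit
            scaled += (q[i] * 0.25f) * k[i];   // partial products 4x smaller
        }
        scaled *= 4.0f;                        // inverse scale on the accumulator

        printf("direct=%f scaled=%f\n", direct, scaled);  // identical results
        return 0;
    }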
@ -36,12 +36,26 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
     const ggml_tensor * KQV  = dst;
     const ggml_tensor * Q    = dst->src[0];
     const ggml_tensor * K    = dst->src[1];
+    const ggml_tensor * V    = dst->src[2];
     const ggml_tensor * mask = dst->src[3];
 
     float max_bias = 0.0f;
     memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
 
-    const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    // Edge cases like no mask, ALiBi, unpadded K/V, or misaligned addresses for large data transfers
+    // are put into the template specialization without GQA optimizations.
+    bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    for (const ggml_tensor * t : {Q, K, V, mask}) {
+        if (t == nullptr) {
+            continue;
+        }
+        for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
+            if (t->nb[i] % 16 != 0) {
+                use_gqa_opt = false;
+                break;
+            }
+        }
+    }
 
     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
     const int gqa_ratio = Q->ne[2] / K->ne[2];
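The loop above gates the GQA-optimized path on 16-byte stride alignment for every operand, since that path issues wide vector loads. The same check in stand-alone form, as a sketch with a hypothetical strides_t standing in for ggml_tensor and its nb array:

    #include <cstddef>

    constexpr size_t MAX_DIMS = 4;                 // stands in for GGML_MAX_DIMS

    struct strides_t { size_t nb[MAX_DIMS]; };     // hypothetical, mirrors ggml_tensor::nb

    static bool strides_16b_aligned(const strides_t * t) {
        if (t == nullptr) {
            return true;                           // an absent optional tensor does not disqualify
        }
        for (size_t i = 1; i < MAX_DIMS; ++i) {    // nb[0] is the element size itself
            if (t->nb[i] % 16 != 0) {
                return false;
            }
        }
        return true;
    }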
@ -4,7 +4,7 @@
 #define CUDA_FILL_BLOCK_SIZE 256
 
 template <typename T>
-static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
+static __global__ void fill_kernel(T * dst, const int64_t k, const T value) {
     const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
     if (i >= k) {
         return;
@ -20,6 +20,7 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
+#include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
@ -2641,6 +2642,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             break;
+        case GGML_OP_DIAG:
+            ggml_cuda_op_diag(ctx, dst);
+            break;
         case GGML_OP_DIAG_MASK_INF:
             ggml_cuda_op_diag_mask_inf(ctx, dst);
             break;
@ -4309,6 +4313,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_EXPM1:
                 case GGML_UNARY_OP_SOFTPLUS:
                 case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_XIELU:
                 case GGML_UNARY_OP_FLOOR:
                 case GGML_UNARY_OP_CEIL:
                 case GGML_UNARY_OP_ROUND:
@ -4624,9 +4629,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_FILL:
         case GGML_OP_CUMSUM:
         case GGML_OP_TRI:
-            return true;
+        case GGML_OP_DIAG:
         case GGML_OP_SOLVE_TRI:
-            return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
+            return true;
         default:
             return false;
     }
@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
                 return 8 * (threadIdx.x / 16) + l;
 #elif defined(RDNA3)
                 return 2 * l + (threadIdx.x / 16);
+#else
+                NO_DEVICE_CODE;
+                return -1;
 #endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
             }
         }
 #elif defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
         static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
         half2 x[ne] = {{0.0f, 0.0f}};
 
         static constexpr __device__ bool supported() {
@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
         static constexpr int I = I_;
         static constexpr int J = J_;
         static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
-        static constexpr int ne = I * J / WARP_SIZE;
-
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
 
 #if defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
+        static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
 
         static constexpr __device__ bool supported() {
             if (I == 16 && J == 8) return true;
             return false;
@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
             }
         }
 #else
+        static constexpr int ne = I * J / WARP_SIZE;
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
         static constexpr __device__ bool supported() {
             if (I == 8 && J == 8) return true;
             if (I == 16 && J == 4) return true;
@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
         }
 #elif defined(AMD_WMMA_AVAILABLE)
         if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
+#if defined(RDNA4)
             ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+#elif defined(RDNA3)
+            ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x,          xs0 + t.get_i(0) * stride + t.get_j(0));
+            ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
+#else
+            NO_DEVICE_CODE;
+#endif // defined(RDNA4)
         } else if constexpr (std::is_same_v<T, int>) {
             if constexpr (I == 16 && J == 4) {
                 int64_t * xi = (int64_t *) t.x;
@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
             const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
             const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
             acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+            using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
+            using floatx8_t = __attribute__((ext_vector_type(8))) float;
+            floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+            const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
+            const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
+            acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
+#else
+            GGML_UNUSED_VARS(D, A, B);
+            NO_DEVICE_CODE;
 #endif // RDNA4
 #else
             GGML_UNUSED_VARS(D, A, B);
@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
             const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
             const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
             acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+            using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
+            using floatx8_t = __attribute__((ext_vector_type(8))) float;
+            floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+            const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
+            const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
+            acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
+#else
+            GGML_UNUSED_VARS(D, A, B);
+            NO_DEVICE_CODE;
 #endif // RDNA4
 #else
             GGML_UNUSED_VARS(D, A, B);
@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16) {
+        if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
+            return false;
+        } else if (src1_ncols > 16) {
             return false;
         }
     }
@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
         case GGML_TYPE_F32:
             return ampere_mma_available(cc);
         case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
         case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return ampere_mma_available(cc) || amd_wmma_available(cc);
         default:
             return false;
     }
@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
         return ne11 <= 8;
     } else if (GGML_CUDA_CC_IS_AMD(cc)) {
         if (fp16_mma_hardware_available(cc)) {
-            if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+            if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+                return ne11 <= 3;
+            }
+            if (GGML_CUDA_CC_IS_RDNA4(cc)) {
                 return ne11 <= 5;
             }
             return ne11 <= 2;
@ -3,6 +3,80 @@
 #include "solve_tri.cuh"
 
 #define MAX_N_FAST 64
+#define MAX_K_FAST 32
+
+static __global__ void get_batch_pointers(const float *  A,
+                                          float *        X,
+                                          const float ** A_ptrs,
+                                          float **       X_ptrs,
+                                          int64_t        ne02,
+                                          int64_t        total_batches,
+                                          size_t         s02,
+                                          size_t         s03,
+                                          size_t         s2,
+                                          size_t         s3) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total_batches) {
+        return;
+    }
+
+    const int64_t i3 = idx / ne02;
+    const int64_t i2 = idx % ne02;
+
+    A_ptrs[idx] = A + i3 * s03 + i2 * s02;
+    X_ptrs[idx] = X + i3 * s3 + i2 * s2;
+}
+
+static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
+                                 const float *               A,
+                                 const float *               B,
+                                 float *                     X,
+                                 int                         n,
+                                 int                         k,
+                                 int64_t                     ne02,
+                                 int64_t                     ne03,
+                                 size_t                      s02,
+                                 size_t                      s03,
+                                 size_t                      s12,
+                                 size_t                      s13,
+                                 size_t                      s2,
+                                 size_t                      s3,
+                                 cudaStream_t                stream) {
+    const float   alpha         = 1.0f;
+    const int64_t total_batches = ne02 * ne03;
+    if (total_batches == 0) {
+        return;
+    }
+
+    // Bulk copy B -> X (contiguous tensors)
+    if (X != B) {
+        const int64_t total_elements_BX = n * k * total_batches;
+        CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+    }
+
+    const int id = ggml_cuda_get_device();
+
+    ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
+    ggml_cuda_pool_alloc<float *>       X_ptrs_alloc(ctx.pool(id), total_batches);
+
+    const float ** A_ptrs_dev = A_ptrs_alloc.get();
+    float **       X_ptrs_dev = X_ptrs_alloc.get();
+
+    get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
+                                                                        total_batches, s02, s03, s2, s3);
+
+    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+
+    // Yes, this is necessary, without this we get RMSE errors
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
+    CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                                    CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
+
+    // revert to standard mode from common.cuh
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
+
+    GGML_UNUSED_VARS(s12, s13);
+}
 
 // ======================
 // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
 }
 
 void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];  // A (triangular n x x matrix)
-    const ggml_tensor * src1 = dst->src[1];  // B (right hand side of n x k equation columns)
+    const ggml_tensor * src0 = dst->src[0];  // A (n×n, lower triangular)
+    const ggml_tensor * src1 = dst->src[1];  // B (n×k)
 
     ggml_is_contiguous(src0);
     ggml_is_contiguous(src1);
 
     const int64_t n = src0->ne[0];
     const int64_t k = src1->ne[0];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    GGML_ASSERT(n <= 64);
-    GGML_ASSERT(k <= 32);
-    solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
-                       src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-                       src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-                       dst->nb[3] / sizeof(float), ctx.stream());
+    if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
+        solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                           src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                           src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                           dst->nb[3] / sizeof(float), ctx.stream());
+    } else {
+        solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                             ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                             src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                             dst->nb[3] / sizeof(float), ctx.stream());
+    }
 }
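For reference, the operation both paths above compute per batch element is a triangular solve A·X = B by forward substitution; the cuBLAS call expresses the same solve as a right-side, upper-triangular TRSM because a row-major matrix reinterpreted as column-major is its transpose. A minimal unbatched CPU sketch in plain C++ (row-major, not part of the patch):

    // A is n x n lower triangular, B and X are n x k, all row-major.
    void solve_tri_ref(const float * A, const float * B, float * X, int n, int k) {
        for (int col = 0; col < k; ++col) {
            for (int i = 0; i < n; ++i) {
                float acc = B[i * k + col];
                // subtract the contributions of already-solved rows
                for (int j = 0; j < i; ++j) {
                    acc -= A[i * n + j] * X[j * k + col];
                }
                X[i * k + col] = acc / A[i * n + i];   // non-unit diagonal
            }
        }
    }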
@ -19,6 +19,9 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_16BF HIPBLAS_R_16B
 #define CUDA_R_32F  HIPBLAS_R_32F
+#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
 #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@ -30,6 +33,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define __all_sync(mask, var) __all(var)
 #define __any_sync(mask, var) __any(var)
+#define cublasStrsmBatched hipblasStrsmBatched
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
@ -12,11 +12,16 @@
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
+#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F MUSA_R_32F
+#define cublasStrsmBatched mublasStrsmBatched
 #define cublasComputeType_t cudaDataType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
@ -1976,9 +1976,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
             break;
 
         case GGML_TYPE_F16:
-            if (!opt_experimental) {
-                return false;
-            }
             break;
 
         default:
@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         const float * restrict vy = (const float * restrict) y;
 
         for (uint32_t i = 0; i < n; i++) {
-            rsum += vx[i] * (__fp16) vy[i];
+            rsum += (float)vx[i] * vy[i];
         }
         *s = rsum;
         return;
@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
     // for some reason we need volatile here so that the compiler doesn't try anything funky
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
+    float r_sum_scalar = 0.0f;
     uint32_t i = 0;
 
     for (i = 0; i < nv0; i++) {
@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         HVX_Vector x = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
 
-        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        //NOTE: need volatile here to prevent compiler optimization
+        // Seem compiler cannot guarantee read-after-write??
+        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
 
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
     }
 
     if (nv1) {
-        HVX_VectorPair yp = vy[i];
-        HVX_Vector x = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
-
-        if (nv1 >= 32) {
-            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-            nv1 -= 32;
-        }
+        // HVX_VectorPair yp = vy[i];
+        // HVX_Vector x = vx[i];
+        // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
+
+        // if (nv1 >= 32) {
+        //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+        //     nv1 -= 32;
+        // }
+
+        // rsum = hvx_vec_qf32_reduce_sum(rsum);
+
+        // if (nv1) {
+        //     volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+        // }
 
+        //process the remainder using scalar loop
         rsum = hvx_vec_qf32_reduce_sum(rsum);
+        const __fp16 * restrict sx = (const __fp16 * restrict) x;
+        const float * restrict sy = (const float * restrict) y;
 
-        if (nv1) {
-            HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+        for (uint32_t i = nv0 * 64; i < n; i++) {
+            r_sum_scalar += (float) sx[i] * sy[i];
         }
 
         // hvx_vec_dump_fp16("X", x);
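The rewrite above drops the fragile partial-vector HVX path in favor of a scalar tail loop whose partial sum is added to the vector reduction at the end. The general shape of that pattern, as a plain C++ sketch with the HVX intrinsics replaced by scalar stand-ins (the chunk width of 64 matches the fp16 path of this routine):

    float dot_with_scalar_tail(const float * x, const float * y, unsigned n) {
        const unsigned VEC = 64;              // elements covered per full vector iteration
        const unsigned nv0 = n / VEC;         // number of full vector chunks

        float vec_sum = 0.0f;                 // stands in for the qf32 vector accumulator
        for (unsigned i = 0; i < nv0 * VEC; ++i) {
            vec_sum += x[i] * y[i];
        }

        float scalar_sum = 0.0f;              // r_sum_scalar in the patch
        for (unsigned i = nv0 * VEC; i < n; ++i) {
            scalar_sum += x[i] * y[i];        // ragged tail, no masking needed
        }
        return vec_sum + scalar_sum;          // *s = reduce(rsum) + r_sum_scalar
    }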
@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         rsum = hvx_vec_qf32_reduce_sum(rsum);
     }
 
-    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
+    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;
 
 # ifdef HTP_DEBUG
     {
@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     uint64_t t1, t2;
     t1 = HAP_perf_get_qtimer_count();
 
-    const size_t src0_row_size = sizeof(__fp16) * ne00;
-    const size_t src1_row_size = sizeof(float) * ne10;
-
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     // This is the size of the rest of the dimensions of the result
     const uint32_t nr1 = ne1 * ne2 * ne3;
 
-    uint32_t chunk_size = 64;
-
     // distribute the thread work across the inner or outer loop based on which one is larger
     uint32_t nchunk0 = nr0 > nr1 ? nth : 1;  // parallelize by src0 rows
     uint32_t nchunk1 = nr0 > nr1 ? 1 : nth;  // parallelize by src1 rows
@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     const uint32_t blck_0 = 64;
     const uint32_t blck_1 = 64;
 
-    float tmp[32];
+    __attribute__((aligned(128))) float tmp[64];
 
     for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
-            for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
+            for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
                 const uint32_t i13 = (ir1 / (ne12 * ne1));
                 const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
                 const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
                 const uint32_t i2 = i12;
                 const uint32_t i3 = i13;
 
-                const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
+                const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
                 const uint8_t * restrict src1_col =
-                    (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
+                    (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
                 float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
-                for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
-                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
+                const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+                for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
+                    // Use nb01 stride for non-contiguous src0 support
+                    const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
+                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
                 }
 
                 hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);
@ -74,14 +74,14 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
 }
 
 static void rope_cache_init(const float theta_base,
-                            float         freq_scale,
+                            const float   freq_scale,
                             const float * freq_factors,
                             float *       corr_dims,
-                            uint32_t      ne0,
-                            float         ext_factor,
-                            float         mscale,
+                            const uint32_t ne0,
+                            const float   ext_factor,
+                            const float   mscale,
                             float *       cache,
-                            float         theta_scale) {
+                            const float   theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;
@ -92,18 +92,19 @@ static void rope_cache_init(const float theta_base,
 
         // Get n-d rotational scaling corrected for extrapolation
         float theta_interp = freq_scale * theta_extrap;
-        float theta2       = theta_interp;
+        float theta_final  = theta_interp;
+        float mscale_final = mscale;
 
         if (ext_factor != 0.0f) {
             float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta2         = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
 
             // Get n-d magnitude scaling corrected for interpolation
-            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
         }
 
-        cache[i0 + 0] = cosf(theta2) * mscale;
-        cache[i0 + 1] = sinf(theta2) * mscale;
+        cache[i0 + 0] = cosf(theta_final) * mscale_final;
+        cache[i0 + 1] = sinf(theta_final) * mscale_final;
 
         theta *= theta_scale;
     }
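The renames above make the YaRN blend easier to follow: the cached cos/sin use an angle interpolated between the frequency-scaled and unscaled values by a per-dimension ramp, and the magnitude correction is applied only when extrapolating. A compact sketch of that blend in plain C++ (the constants match the kernel; the helper name is invented):

    #include <cmath>

    // On entry mscale holds attn_factor; on exit it carries the extrapolation
    // correction. Returns the blended angle the caller feeds to cosf/sinf.
    float yarn_theta(float theta_extrap, float freq_scale, float ramp,
                     float ext_factor, float & mscale) {
        const float theta_interp = freq_scale * theta_extrap;
        float theta_final = theta_interp;
        if (ext_factor != 0.0f) {
            const float ramp_mix = ramp * ext_factor;
            theta_final = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix;
            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
        }
        return theta_final;
    }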
@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                          const uint32_t ir1,
                          int nth,
                          int ith,
-                         int opt_path) {
+                         const int opt_path) {
     struct htp_ops_context * octx = rope_ctx->octx;
 
     const struct htp_tensor * src0 = &octx->src0;
@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
         freq_factors = (const float *) src2->data;
     }
 
-    int ir = 0;
+    const uint32_t i1_end = MIN(ir1, ne1);
+    const int32_t half_dims = rope_ctx->n_dims / 2;
+    const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
     for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
         for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
             const int32_t p = pos[i2];
@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
             rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                             rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
 
-            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
-                if (ir++ < ir0) {
-                    continue;
-                }
-                if (ir > ir1) {
-                    break;
-                }
-
+            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
                 const float * src      = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                 float *       dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                 } else {
                     hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                 }
+
+                src_loc      += rope_ctx->n_dims;
+                dst_data_loc += rope_ctx->n_dims;
             } else {
                 for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                     const float cos_theta = wp0[i0 + 0];
@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
 
                     if (is_neox) {
                         const float x0 = src_loc[0];
-                        const float x1 = src_loc[rope_ctx->n_dims/2];
+                        const float x1 = src_loc[half_dims];
 
                         dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                        dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
+                        dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
 
                         src_loc += 1;
                         dst_data_loc += 1;
@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                         dst_data_loc += 2;
                     }
                 }
+
+                src_loc      += (is_neox ? half_dims : 0);
+                dst_data_loc += (is_neox ? half_dims : 0);
             }
 
-            for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
-                dst_data_loc[0] = src_loc[0];
-                dst_data_loc[1] = src_loc[1];
-
-                src_loc += 2;
-                dst_data_loc += 2;
-            }
+            // TODO: use simd to speed up the remaining elements copy
+            memcpy(dst_data_loc, src_loc, remain_bytes);
             }
         }
     }
@ -411,6 +411,38 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_me
     return res;
 }
 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched(ggml_metal_library_t lib, const ggml_tensor * op, int ssm_conv_bs) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
+    GGML_ASSERT(ggml_is_contiguous(op->src[1]));
+
+    char base[256];
+    char name[256];
+
+    const char * suffix = "";
+    if (op->src[1]->ne[0] % 4 == 0) {
+        suffix = "_4";
+    }
+
+    snprintf(base, 256, "kernel_ssm_conv_%s_%s_batched%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type), suffix);
+    snprintf(name, 256, "%s_ssm_conv_bs=%d", base, ssm_conv_bs);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+        ggml_metal_cv_set_int16(cv, ssm_conv_bs, FC_SSM_CONV + 0);
+
+        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+        ggml_metal_cv_free(cv);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_library_t lib, const ggml_tensor * op) {
     GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
@ -427,7 +459,12 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_me
         res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
     }
 
-    res.smem = 32*sizeof(float)*nsg;
+    // Shared memory layout:
+    // - sgptg * NW floats for partial sums (nsg * 32)
+    // - sgptg floats for shared_x_dt (nsg)
+    // - sgptg floats for shared_dA (nsg)
+    // Total: nsg * (32 + 2) floats
+    res.smem = (32 + 2)*sizeof(float)*nsg;
 
     return res;
 }
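The enlarged allocation above appends two floats per simdgroup behind the existing partial-sum region. A sketch of the resulting offsets in plain C++, assuming NW = 32 (hypothetical helper, not part of the patch; offsets are in float elements, total in bytes):

    #include <cstddef>

    struct ssm_smem_layout {
        size_t sums_off;   // start of nsg * NW partial sums
        size_t x_dt_off;   // start of nsg pre-computed x*dt values
        size_t dA_off;     // start of nsg pre-computed softplus(dt) values
        size_t total;      // bytes to request for the threadgroup
    };

    ssm_smem_layout make_layout(int nsg) {
        const int NW = 32;
        ssm_smem_layout l;
        l.sums_off = 0;
        l.x_dt_off = size_t(nsg) * NW;
        l.dA_off   = l.x_dt_off + nsg;
        l.total    = (size_t(NW) + 2) * nsg * sizeof(float);
        return l;
    }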
@ -117,6 +117,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_ad
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max         (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv         (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan         (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv             (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext       (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
@ -769,9 +769,16 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 #endif
 
     dev->props.use_shared_buffers = dev->props.has_unified_memory;
+#if TARGET_OS_OSX
+    // In case of eGPU, shared memory may be preferable.
+    dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
+#endif
     if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
         dev->props.use_shared_buffers = false;
     }
+    if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
+        dev->props.use_shared_buffers = true;
+    }
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
@ -77,6 +77,7 @@
 #define FC_MUL_MV  600
 #define FC_MUL_MM  700
 #define FC_ROPE    800
+#define FC_SSM_CONV 900
 
 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPTG 8
@ -221,7 +221,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
     }
 
     if (ctx->debug_graph > 0) {
-        GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
+        GGML_LOG_DEBUG("%s: node[%5d] - %-12s %-12s %s\n", __func__, idx, ggml_op_name(node->op), ggml_get_name(node), is_concurrent ? "(concurrent)" : "");
     }
     if (ctx->debug_graph > 1) {
         GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
@ -1365,6 +1365,33 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
         /*.nb2 =*/ nb2,
     };
 
+    // Use batched kernel for prefill (ne1 > 1) to reduce threadgroup dispatch overhead
+    const bool use_batched = (ne1 > 1);
+
+    if (use_batched) {
+        // Determine the smallest power of 2 that's >= ne1, but <= 256
+        int BATCH_SIZE;
+        if      (ne1 > 128) BATCH_SIZE = 256;
+        else if (ne1 > 64 ) BATCH_SIZE = 128;
+        else if (ne1 > 32 ) BATCH_SIZE = 64;
+        else if (ne1 > 16 ) BATCH_SIZE = 32;
+        else if (ne1 > 8  ) BATCH_SIZE = 16;
+        else if (ne1 > 4  ) BATCH_SIZE = 8;
+        else                BATCH_SIZE = 2;
+
+        auto pipeline = ggml_metal_library_get_pipeline_ssm_conv_batched(lib, op, BATCH_SIZE);
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
+
+        // Dispatch: ne01 rows, ceil(ne1/BATCH_SIZE) token batches, ne02 sequences
+        // Each threadgroup has BATCH_SIZE threads, each handling one token
+        const int n_token_batches = (ne1 + BATCH_SIZE - 1) / BATCH_SIZE;
+        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, n_token_batches, ne02, BATCH_SIZE, 1, 1);
+    } else {
     auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
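The ladder above picks the threadgroup batch size. As a stand-alone helper it looks like the sketch below (plain C++, invented name). Note it is not literally the smallest power of two >= ne1 at the low end: ne1 of 3 or 4 yields 2, which is still correct because n_token_batches is computed by ceil division, so extra threadgroups absorb the remaining tokens:

    #include <cstdint>

    int ssm_conv_batch_size(int64_t ne1) {
        if (ne1 > 128) return 256;
        if (ne1 >  64) return 128;
        if (ne1 >  32) return  64;
        if (ne1 >  16) return  32;
        if (ne1 >   8) return  16;
        if (ne1 >   4) return   8;
        return 2;
    }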
@ -1374,6 +1401,7 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
 
     ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+    }
 
     return 1;
 }
@ -2343,7 +2343,102 @@ kernel void kernel_ssm_conv_f32_f32_4(
     x[0] = sumf;
 }
 
+constant short FC_ssm_conv_bs [[function_constant(FC_SSM_CONV + 0)]];
+
+// Batched version: each threadgroup processes multiple tokens for better efficiency
+// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens
+kernel void kernel_ssm_conv_f32_f32_batched(
+        constant ggml_metal_kargs_ssm_conv & args,
+        device const void * src0,
+        device const void * src1,
+        device       float * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3 ntg[[threads_per_threadgroup]]) {
+    // tgpig.x = row index (ir)
+    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
+    // tgpig.z = sequence index (i3)
+    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
+    const short BATCH_SIZE = FC_ssm_conv_bs;
+
+    const int64_t ir      = tgpig.x;
+    const int64_t i2_base = tgpig.y * BATCH_SIZE;
+    const int64_t i3      = tgpig.z;
+    const int64_t i2_off  = tpitg.x;
+    const int64_t i2      = i2_base + i2_off;
+
+    const int64_t nc  = args.ne10; // conv kernel size (typically 4)
+    const int64_t n_t = args.ne1;  // number of tokens
+
+    // Bounds check for partial batches at the end
+    if (i2 >= n_t) {
+        return;
+    }
+
+    // Load conv weights (shared across all tokens for this row)
+    device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
+
+    // Load source for this specific token
+    device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
+
+    // Output location for this token
+    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
+
+    float sumf = 0.0f;
+    for (int64_t i0 = 0; i0 < nc; ++i0) {
+        sumf += s[i0] * c[i0];
+    }
+
+    x[0] = sumf;
+}
+
+kernel void kernel_ssm_conv_f32_f32_batched_4(
+        constant ggml_metal_kargs_ssm_conv & args,
+        device const void * src0,
+        device const void * src1,
+        device       float * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3 ntg[[threads_per_threadgroup]]) {
+    // tgpig.x = row index (ir)
+    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
+    // tgpig.z = sequence index (i3)
+    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
+    const short BATCH_SIZE = FC_ssm_conv_bs;
+
+    const int64_t ir      = tgpig.x;
+    const int64_t i2_base = tgpig.y * BATCH_SIZE;
+    const int64_t i3      = tgpig.z;
+    const int64_t i2_off  = tpitg.x;
+    const int64_t i2      = i2_base + i2_off;
+
+    const int64_t nc  = args.ne10; // conv kernel size (typically 4)
+    const int64_t n_t = args.ne1;  // number of tokens
+
+    // Bounds check for partial batches at the end
+    if (i2 >= n_t) {
+        return;
+    }
+
+    // Load conv weights (shared across all tokens for this row)
+    device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
+
+    // Load source for this specific token
+    device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
+
+    // Output location for this token
+    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
+
+    float sumf = 0.0f;
+    for (int64_t i0 = 0; i0 < nc/4; ++i0) {
+        sumf += dot(s[i0], c[i0]);
+    }
+
+    x[0] = sumf;
+}
+
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
+// Optimized version: reduces redundant memory loads by having one thread load shared values
 kernel void kernel_ssm_scan_f32(
         constant ggml_metal_kargs_ssm_scan & args,
         device const void * src0,
@ -2363,7 +2458,15 @@ kernel void kernel_ssm_scan_f32(
         uint3 tgpg[[threadgroups_per_grid]]) {
     constexpr short NW = N_SIMDWIDTH;
 
-    shared[tpitg.x] = 0.0f;
+    // Shared memory layout:
+    // [0..sgptg*NW-1]:                      partial sums for reduction (existing)
+    // [sgptg*NW..sgptg*NW+sgptg-1]:         pre-computed x_dt values for each token in batch
+    // [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch
+    threadgroup float * shared_sums = shared;
+    threadgroup float * shared_x_dt = shared + sgptg * NW;
+    threadgroup float * shared_dA   = shared + sgptg * NW + sgptg;
+
+    shared_sums[tpitg.x] = 0.0f;
 
     const int32_t i0 = tpitg.x;
     const int32_t i1 = tgpig.x;
@ -2403,32 +2506,47 @@ kernel void kernel_ssm_scan_f32(
     for (int i2 = 0; i2 < n_t; i2 += sgptg) {
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
+        // Pre-compute x_dt and dA for this batch of tokens
+        // Only first sgptg threads do the loads and expensive math
+        if (i0 < sgptg && i2 + i0 < n_t) {
+            // ns12 and ns21 are element strides (nb12/nb10, nb21/nb20)
+            device const float * x_t  = x  + i0 * args.ns12;
+            device const float * dt_t = dt + i0 * args.ns21;
+
+            const float dt0  = dt_t[0];
+            const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
+            shared_x_dt[i0] = x_t[0] * dtsp;
+            shared_dA[i0]   = dtsp; // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
         for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
-            const float dt0  = dt[0];
-            const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
-            const float x_dt = x[0] * dtsp;
-            const float dA   = exp(dtsp * A0);
+            const float x_dt = shared_x_dt[t];
+            const float dA   = exp(shared_dA[t] * A0);
 
             s = (s0 * dA) + (B[i0] * x_dt);
 
             const float sumf = simd_sum(s * C[i0]);
 
             if (tiisg == 0) {
-                shared[t*NW + sgitg] = sumf;
+                shared_sums[t*NW + sgitg] = sumf;
             }
 
             // recurse
             s0 = s;
 
-            x  += args.ns12;
-            dt += args.ns21;
             B  += args.ns42;
             C  += args.ns52;
         }
 
+        // Advance pointers for next batch
+        x  += sgptg * args.ns12;
+        dt += sgptg * args.ns21;
+
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
-        const float sumf = simd_sum(shared[sgitg*NW + tiisg]);
+        const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]);
 
         if (tiisg == 0 && i2 + sgitg < n_t) {
             y[sgitg*nh*nr] = sumf;
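The precomputation above evaluates softplus(dt) once per token instead of once per thread. The guard at 20.0f avoids wasted work without losing accuracy: for dt greater than about 20, log(1 + exp(dt)) equals dt to float precision, so the kernel can switch to the identity branch. As a stand-alone sketch in plain C++:

    #include <cmath>

    // Guarded softplus as used when pre-computing dtsp. For dt > 20,
    // exp(-dt) < 3e-9, so log(1 + exp(dt)) = dt + log(1 + exp(-dt)) rounds
    // to dt in float; returning dt directly is exact and cheaper.
    float softplus_guarded(float dt) {
        return dt <= 20.0f ? std::log(1.0f + std::exp(dt)) : dt;
    }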
@ -0,0 +1,77 @@
+#include <sycl/sycl.hpp>
+#include "common.hpp"
+#include "add-id.hpp"
+
+static void add_id_kernel(
+    const float* src0,
+    const float* src1,
+    const int32_t* src2,
+    float* dst,
+    int64_t ne0,
+    int64_t ne1,
+    size_t nb01,
+    size_t nb02,
+    size_t nb11,
+    size_t nb21,
+    sycl::nd_item<3> item_ct1) {
+    const int64_t i1 = item_ct1.get_group(2);
+    const int64_t i2 = item_ct1.get_group(1);
+
+    const int i11 =
+        *(const int32_t*)((const char*)src2 + i1 * sizeof(int32_t) + i2 * nb21);
+
+    const size_t nb1 = ne0 * sizeof(float);
+    const size_t nb2 = ne1 * nb1;
+
+    float* dst_row = (float*)((char*)dst + i1 * nb1 + i2 * nb2);
+    const float* src0_row =
+        (const float*)((const char*)src0 + i1 * nb01 + i2 * nb02);
+    const float* src1_row = (const float*)((const char*)src1 + i11 * nb11);
+
+    for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0;
+         i0 += item_ct1.get_local_range(2)) {
+        dst_row[i0] = src0_row[i0] + src1_row[i0];
+    }
+}
+
+void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    const ggml_tensor* src0 = dst->src[0];
+    const ggml_tensor* src1 = dst->src[1];
+    const ggml_tensor* src2 = dst->src[2];
+
+    GGML_TENSOR_TERNARY_OP_LOCALS
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb20 == sizeof(int32_t));
+
+    const float* src0_d = (const float*)src0->data;
+    const float* src1_d = (const float*)src1->data;
+    const int32_t* src2_d = (const int32_t*)src2->data;
+    float* dst_d = (float*)dst->data;
+
+    int threads = std::min((int)ne00, 768); // cols
+    ctx.stream()->parallel_for(
+        sycl::nd_range<3>(
+            sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
+            sycl::range<3>(1, 1, threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            add_id_kernel(
+                src0_d,
+                src1_d,
+                src2_d,
+                dst_d,
+                ne0,
+                ne1,
+                nb01,
+                nb02,
+                nb11,
+                nb21,
+                item_ct1);
+        });
+}