Merge branch 'ggml-org:master' into i8mm-ci

commit f4b71ac22f
@@ -4,7 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
     exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model (chat) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin"
+    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
     echo "      ex: -m model.gguf"
     echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
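With this change, tools.sh dispatches --run to the chat-oriented llama-cli and adds --run-legacy for the old completion-style flow via llama-completion. A rough usage sketch (the image name and model path below are placeholders, not taken from this diff):

    docker run -v /path/to/models:/models <full-image> --run -m /models/7B/ggml-model-q4_0.bin
    docker run -v /path/to/models:/models <full-image> --run-legacy -m /models/7B/ggml-model-q4_0.bin -no-cnv -p "Building a website can be done in 10 simple steps:" -n 512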
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
@@ -11,7 +11,7 @@ body:
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-       The `llama-cli` binary can be used for simple and reproducible model inference.
+       The `llama-completion` binary can be used for simple and reproducible model inference.
   - type: textarea
     id: version
     attributes:
@@ -74,9 +74,12 @@ body:
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
+
+       If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+       If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
      placeholder: >
-       e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-       When I use -ngl 0 it works correctly.
+       e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+       With short prompts or `-fa off` it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
@@ -20,7 +20,8 @@ on:
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
-     '**/*.comp'
+     '**/*.comp',
+     '**/*.glsl'
    ]

  pull_request:
@@ -40,7 +41,8 @@ on:
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
-     '**/*.comp'
+     '**/*.comp',
+     '**/*.glsl'
    ]

 concurrency:
@@ -1400,26 +1402,55 @@ jobs:
        chip_type: ['910b', '310p']
        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-   container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
+       with:
+         fetch-depth: 0

-     - name: Dependencies
+     - name: Free up disk space
+       uses: ggml-org/free-disk-space@v1.3.1
+       with:
+         tool-cache: true
+
+     - name: Set container image
+       id: cann-image
        run: |
-         yum update -y
-         yum install -y git gcc gcc-c++ make cmake libcurl-devel
+         image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+         echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+     - name: Pull container image
+       run: docker pull "${{ steps.cann-image.outputs.image }}"

      - name: Build
+       env:
+         BUILD_TYPE: ${{ matrix.build }}
+         SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-         export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+         HOST_UID=$(id -u)
+         HOST_GID=$(id -g)
+
+         docker run --rm \
+           -v "${PWD}:/workspace" \
+           -w /workspace \
+           -e SOC_TYPE=${SOC_TYPE} \
+           -e BUILD_TYPE=${BUILD_TYPE} \
+           "${{ steps.cann-image.outputs.image }}" \
+           bash -lc '
+             set -e
+             yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+             yum clean all && rm -rf /var/cache/yum
+             git config --global --add safe.directory "/workspace"
+             export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
              cmake -S . -B build \
-               -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+               -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
                -DGGML_CANN=on \
-               -DSOC_TYPE=ascend${{ matrix.chip_type }}
+               -DSOC_TYPE=${SOC_TYPE}
              cmake --build build -j $(nproc)
+
+             chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+           '

  # TODO: simplify the following workflows using a matrix
  # TODO: run lighter CI on PRs and the full CI only on master (if needed)
  ggml-ci-x64-cpu-low-perf:
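The containerized build above can be approximated outside of CI when debugging CANN issues locally; a minimal sketch assuming the same pinned image and the 910b/Release matrix entry:

    docker run --rm -v "$PWD:/workspace" -w /workspace \
        -e SOC_TYPE=ascend910b -e BUILD_TYPE=Release \
        ascendai/cann:8.3.rc2-910b-openeuler24.03-py3.11 \
        bash -lc '
            set -e
            yum install -y git gcc gcc-c++ make cmake libcurl-devel
            export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
            cmake -S . -B build -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DGGML_CANN=on -DSOC_TYPE=${SOC_TYPE}
            cmake --build build -j $(nproc)
        '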
@@ -731,6 +731,78 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz

+  openEuler-cann:
+    strategy:
+      matrix:
+        arch: [x86, aarch64]
+        chip_type: ['910b', '310p']
+        build: ['Release']
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true
+
+      - name: Set container image
+        id: cann-image
+        run: |
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"
+
+      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
+        run: |
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+              cmake -S . -B build \
+                -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+                -DGGML_CANN=on \
+                -DSOC_TYPE=${SOC_TYPE}
+              cmake --build build -j $(nproc)
+
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -752,6 +824,7 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
+     - openEuler-cann

    steps:
      - name: Clone
@@ -844,6 +917,12 @@ jobs:
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
+
+           **openEuler:**
+           - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+           - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+           - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+           - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)

      - name: Upload release
        id: upload_release
        uses: actions/github-script@v3
@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp

 # Deprecated
@@ -87,7 +87,8 @@
 /tests/ @ggerganov
 /tests/test-chat-.* @pwilkin
 /tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
 /tools/quantize/ @ggerganov
@@ -313,7 +313,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua

 To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@@ -525,7 +525,8 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## Other documentation

-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)
@@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
     ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
     ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
     (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {

     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {

     model_f16="${path_models}/ggml-model-f16.gguf"

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
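The ci/run.sh additions above insert a llama-fit-params probe before the existing generation and embedding runs. The same probe can be run by hand against any local GGUF file (the paths here are placeholders):

    ./bin/llama-fit-params --model models/ggml-model-f16.gguf
    ./bin/llama-completion -no-cnv --model models/ggml-model-f16.gguf -ngl 99 -c 1024 -n 64 --ignore-eos -p "I believe the meaning of life is"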
common/arg.cpp
@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>

 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <fstream>
@@ -105,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {

 bool common_arg::get_value_from_env(std::string & output) const {
     if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
     char * value = std::getenv(env);
     if (value) {
         output = value;
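The args_neg/LLAMA_ARG_NO_ shim above keeps both spellings of a boolean environment variable working; a sketch using the context-shift option that is converted to a paired flag later in this diff:

    # new spelling: the value goes through the bool handler (on/off, true/false, 1/0, enabled/disabled)
    LLAMA_ARG_CONTEXT_SHIFT=off ./llama-server -m model.gguf
    # old spelling, kept for compatibility: get_value_from_env() resolves it to the falsey "0"
    LLAMA_ARG_NO_CONTEXT_SHIFT=1 ./llama-server -m model.gguf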
@@ -114,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }

 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
     return env != nullptr && std::getenv(env);
 }

@@ -151,9 +170,10 @@ std::string common_arg::to_string() const {
     std::string leading_spaces(n_leading_spaces, ' ');

     std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                 ss << arg;
             } else {
                 // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -162,7 +182,7 @@ std::string common_arg::to_string() const {
                 ss << tmp << spaces;
             }
         } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     if (value_hint) ss << " " << value_hint;
@@ -181,6 +201,31 @@ std::string common_arg::to_string() const {
     return ss.str();
 }

+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
@@ -316,6 +361,16 @@ static std::string get_all_kv_cache_types() {
     return msg.str();
 }

+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //
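Boolean environment values are routed through parse_bool_value() further down, so any spelling accepted by is_truthy()/is_falsey() works; for example (the model path is a placeholder):

    LLAMA_ARG_MMAP=disabled ./llama-completion -m model.gguf -p "hello"   # same effect as --no-mmap
    LLAMA_ARG_MMAP=true ./llama-completion -m model.gguf -p "hello"       # same effect as --mmap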
@@ -323,10 +378,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
         }
     }

@@ -335,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         std::string value;
         if (opt.get_value_from_env(value)) {
             try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                     opt.handler_void(params);
                 }
                 if (opt.handler_int) {
                     opt.handler_int(params, std::stoi(value));
                 }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                 if (opt.handler_string) {
                     opt.handler_string(params, value);
                     continue;
@@ -369,7 +430,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
         }
@@ -378,6 +441,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             opt.handler_void(params);
             continue;
         }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }

         // arg with single value
         check_arg(i);
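On the command line, the positive and negative spellings of a paired option now share a single bool handler, with is_positive selecting the value; e.g. with the conversation flags defined later in this file:

    ./llama-completion -m model.gguf -cnv      # force conversation mode on
    ./llama-completion -m model.gguf -no-cnv   # force conversation mode off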
@@ -402,7 +469,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
         }
     }

@@ -438,7 +505,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
         throw std::invalid_argument("error: --model is required\n");
     }

@@ -463,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }

-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

@@ -573,6 +642,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-batched-bench",
         "llama-bench",
         "llama-cli",
+        "llama-completion",
         "llama-convert-llama2c-to-ggml",
         "llama-cvector-generator",
         "llama-embedding",

@@ -657,7 +727,7 @@ static void add_rpc_devices(const std::string & servers) {
         }
     }
 }

-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
     common_params dummy_params;
     common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);

@@ -666,6 +736,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<comm
         for (const auto & arg : opt.args) {
             arg_to_options[arg] = &opt;
         }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = &opt;
+        }
     }

     // TODO @ngxson : find a way to deduplicate this code

@@ -750,11 +823,11 @@ static std::string list_builtin_chat_templates() {
 }

 bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }

 bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }

 bool common_arg_utils::is_autoy(const std::string & value) {

@@ -839,10 +912,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--display-prompt"},
         {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(

@@ -1055,18 +1129,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.kv_unified = true;
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED"));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(

@@ -1106,20 +1174,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
+        {"--perf"},
         {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
         }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
     add_opt(common_arg(
+        {"--show-timings"},
         {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

@@ -1171,16 +1241,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
         {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
         }
     ));
     add_opt(common_arg(

@@ -1227,19 +1291,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
         "- does not print special tokens and suffix/prefix\n"
         "- interactive mode is also enabled\n"
         "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(

@@ -1297,10 +1355,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"--warmup"},
         {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(

@@ -1359,7 +1418,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.top_k = value;
             params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
         }
-    ).set_sparam());
+    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
     add_opt(common_arg(
         {"--top-p"}, "N",
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),

@@ -1702,19 +1761,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
         {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
         }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
     add_opt(common_arg(
+        {"--repack"},
         {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
         }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
     add_opt(common_arg(
         {"--no-host"},
         "bypass host buffer allowing extra buffers to be used",

@@ -1843,18 +1904,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
-        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
+        {"-nocb", "--no-cont-batching"},
+        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cont_batching = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
-    add_opt(common_arg(
-        {"-nocb", "--no-cont-batching"},
-        "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"

@@ -1871,19 +1926,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
     add_opt(common_arg(
+        {"--mmproj-offload"},
         {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",

@@ -1923,12 +1980,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(common_arg(
+        {"--mmap"},
         {"--no-mmap"},
-        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](common_params & params) {
-            params.use_mmap = false;
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_mmap = value;
         }
-    ).set_env("LLAMA_ARG_NO_MMAP"));
+    ).set_env("LLAMA_ARG_MMAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"

@@ -2098,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitt", "--fit-target" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_TARGET"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),

@@ -2116,10 +2202,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--op-offload"},
         {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
         }
     ));
     add_opt(common_arg(

@@ -2315,10 +2402,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
+        {"--ppl"},
         {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(

@@ -2437,12 +2525,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),

@@ -2547,18 +2636,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
+        {"--no-slots"},
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
-        {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",

@@ -2609,26 +2692,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
+        {"--no-jinja"},
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
-        {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

@@ -2673,15 +2751,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
|
||||||
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
|
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
|
||||||
),
|
),
|
||||||
[](common_params & params) {
|
[](common_params & params, bool value) {
|
||||||
params.prefill_assistant = false;
|
params.prefill_assistant = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
||||||
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
||||||
|
|
|
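Note: the hunks above fold each --foo/--no-foo pair into a single option that carries both spellings and one bool handler. Below is a small, self-contained sketch of that dispatch rule, for illustration only; toy_bool_opt and dispatch are hypothetical names and not part of llama.cpp.

    #include <iostream>
    #include <string>
    #include <vector>

    // Toy version of the paired-flag idea: one option owns both its positive and
    // negated spellings, and a single bool handler receives the resolved value.
    struct toy_bool_opt {
        std::vector<std::string> args;     // e.g. {"--webui"}
        std::vector<std::string> args_neg; // e.g. {"--no-webui"}
        void (*handler)(bool value);
    };

    static bool dispatch(const toy_bool_opt & opt, const std::string & flag) {
        for (const auto & a : opt.args) {
            if (a == flag) { opt.handler(true);  return true; }
        }
        for (const auto & a : opt.args_neg) {
            if (a == flag) { opt.handler(false); return true; }
        }
        return false; // unknown flag
    }

    int main() {
        toy_bool_opt webui = { {"--webui"}, {"--no-webui"},
                               [](bool v) { std::cout << "webui = " << (v ? "on" : "off") << "\n"; } };
        dispatch(webui, "--webui");    // prints: webui = on
        dispatch(webui, "--no-webui"); // prints: webui = off
        return 0;
    }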
common/arg.h

@@ -16,6 +16,7 @@ struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
     std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
+    std::vector<const char *> args_neg; // for negated args like --no-xxx
     const char * value_hint = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
     const char * env = nullptr;
@@ -25,6 +26,7 @@ struct common_arg {
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;

     common_arg() = default;

@@ -48,6 +50,13 @@ struct common_arg {
         void (*handler)(common_params & params)
     ) : args(args), help(help), handler_void(handler) {}

+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
     // support 2 values for arg
     common_arg(
         const std::initializer_list<const char *> & args,
@@ -80,6 +89,10 @@ struct common_arg {
         }
         return strcmp(args[0], other.args[0]) == 0;
     }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };

 namespace common_arg_utils {

@@ -102,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e

 // parse input arguments from CLI into a map
 // TODO: support repeated args in the future
-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
 // Model utils
 //

-static inline void common_init_sampler_from_model(
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
         const llama_model * model,
         common_params_sampling & sparams) {

     const uint64_t config = sparams.user_sampling_config;

     auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }

         char buf[64] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };

     auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }

         char buf[128] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };
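Note: the expanded end-pointer checks above only commit a value when strtol/strtof actually consumed characters, so empty or non-numeric model metadata leaves the caller's default untouched. A standalone sketch of the same guard, independent of llama.cpp (parse_int32 is a hypothetical helper):

    #include <cstdio>
    #include <cstdlib>

    // The value is only committed when the parser consumed at least one character.
    static bool parse_int32(const char * buf, int & dst) {
        char * end = nullptr;
        const long v = strtol(buf, &end, 10);
        if (end == nullptr || end == buf) {
            return false; // nothing parsed - keep the previous value of dst
        }
        dst = (int) v;
        return true;
    }

    int main() {
        int top_k = 40;                     // pre-existing default
        parse_int32("64", top_k);           // top_k becomes 64
        parse_int32("not-a-number", top_k); // top_k stays 64
        printf("top_k = %d\n", top_k);
        return 0;
    }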
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
     get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
 }

-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+                params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+                params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }

     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        return iparams;
+        return;
     }

-    common_init_sampler_from_model(model, params.sampling);
+    pimpl->model.reset(model);

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    auto cparams = common_context_params_to_llama(params);
+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }

     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return;
     }

+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return pimpl->samplers[seq_id].get();
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {

         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }

         int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                 params.control_vector_layer_start,
                 params.control_vector_layer_end);
         if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }

@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }

         if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }

@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
         }

         char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
         la.task_name = buf;
         llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
         la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
     }

     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }

-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-                params.sampling.logit_bias.end(),
-                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_set_warmup(lctx, false);
     }

-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }

+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
     std::string model_endpoint = "https://huggingface.co/";
     if (endpoint_env) {
        model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
     }
     return model_endpoint;
 }
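Note: a minimal caller-side sketch of the reworked initialization path, assuming the API lands exactly as declared in this patch (common_init_result_ptr with model(), context(), sampler(seq_id) and free_context()); tokenization, decoding and error reporting are elided.

    #include "common.h"
    #include "sampling.h"
    #include "llama.h"

    // Load a model/context through the new pimpl-based common_init_result and
    // fetch the pre-built sampler for sequence 0.
    int run_once(common_params & params) {
        common_init_result_ptr init = common_init_from_params(params);

        llama_model   * model = init->model();
        llama_context * lctx  = init->context();
        if (model == nullptr || lctx == nullptr) {
            return 1; // loading failed - resources are released by the destructor
        }

        common_sampler * smpl = init->sampler(0); // one sampler per sequence

        // ... tokenize, llama_decode(), common_sampler_sample()/accept() ...

        init->free_context(); // optionally drop the context before the model
        return 0;
    }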
@@ -99,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,

     LLAMA_EXAMPLE_COUNT,
 };
@@ -195,7 +196,6 @@ struct common_params_sampling {

     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

-
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
@@ -216,6 +216,10 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -303,8 +307,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 4096; // context size
+    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -328,6 +332,9 @@ struct common_params {
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -669,15 +676,29 @@ bool tty_can_use_colors();
 // Model utils
 //

-// note: defines object's lifetime
-struct common_init_result {
-    llama_model_ptr model;
-    llama_context_ptr context;
-
-    std::vector<llama_adapter_lora_ptr> lora;
+struct common_sampler;
+
+// note: defines the model, context, samplers, ets. lifetimes
+struct common_init_result {
+    common_init_result(common_params & params);
+    ~common_init_result();
+
+    llama_model * model();
+    llama_context * context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };

-struct common_init_result common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
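Note: the new fit_params fields are ordinary members of common_params, so embedders that bypass the CLI can tune the memory-fitting step directly. A hedged sketch only; configure_fit is a hypothetical helper and the 512 MiB target is an arbitrary example value, not a recommendation.

    #include "common.h"

    // Tune the parameter-fitting step before loading; field names are the ones
    // introduced in this change.
    void configure_fit(common_params & params) {
        params.fit_params         = true;               // let the fitting step adjust unset values
        params.fit_params_target  = 512u * 1024 * 1024; // leave ~512 MiB free per device
        params.fit_params_min_ctx = 8192;               // don't shrink the context below 8k tokens
    }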
|
@ -23,10 +23,16 @@ std::vector<std::string> common_preset::to_args() const {
|
||||||
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
||||||
// flag option, no value
|
// flag option, no value
|
||||||
if (common_arg_utils::is_falsey(value)) {
|
if (common_arg_utils::is_falsey(value)) {
|
||||||
// skip the flag
|
// use negative arg if available
|
||||||
|
if (!opt.args_neg.empty()) {
|
||||||
|
args.back() = opt.args_neg.back();
|
||||||
|
} else {
|
||||||
|
// otherwise, skip the flag
|
||||||
|
// TODO: maybe throw an error instead?
|
||||||
args.pop_back();
|
args.pop_back();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (opt.value_hint != nullptr) {
|
if (opt.value_hint != nullptr) {
|
||||||
// single value
|
// single value
|
||||||
args.push_back(value);
|
args.push_back(value);
|
||||||
|
|
@ -141,16 +147,31 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
|
||||||
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
|
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
|
||||||
std::map<std::string, common_arg> mapping;
|
std::map<std::string, common_arg> mapping;
|
||||||
for (const auto & opt : ctx_params.options) {
|
for (const auto & opt : ctx_params.options) {
|
||||||
if (opt.env != nullptr) {
|
for (const auto & env : opt.get_env()) {
|
||||||
mapping[opt.env] = opt;
|
mapping[env] = opt;
|
||||||
}
|
}
|
||||||
for (const auto & arg : opt.args) {
|
for (const auto & arg : opt.get_args()) {
|
||||||
mapping[rm_leading_dashes(arg)] = opt;
|
mapping[rm_leading_dashes(arg)] = opt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return mapping;
|
return mapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool is_bool_arg(const common_arg & arg) {
|
||||||
|
return !arg.args_neg.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
|
||||||
|
// if this is a negated arg, we need to reverse the value
|
||||||
|
for (const auto & neg_arg : arg.args_neg) {
|
||||||
|
if (rm_leading_dashes(neg_arg) == key) {
|
||||||
|
return common_arg_utils::is_truthy(value) ? "false" : "true";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// otherwise, not negated
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
|
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
|
||||||
common_presets out;
|
common_presets out;
|
||||||
auto key_to_opt = get_map_key_opt(ctx_params);
|
auto key_to_opt = get_map_key_opt(ctx_params);
|
||||||
|
|
@ -167,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte
|
||||||
for (const auto & [key, value] : section.second) {
|
for (const auto & [key, value] : section.second) {
|
||||||
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
||||||
if (key_to_opt.find(key) != key_to_opt.end()) {
|
if (key_to_opt.find(key) != key_to_opt.end()) {
|
||||||
preset.options[key_to_opt[key]] = value;
|
auto & opt = key_to_opt[key];
|
||||||
LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
|
if (is_bool_arg(opt)) {
|
||||||
|
preset.options[opt] = parse_bool_arg(opt, key, value);
|
||||||
|
} else {
|
||||||
|
preset.options[opt] = value;
|
||||||
|
}
|
||||||
|
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
|
||||||
} else {
|
} else {
|
||||||
// TODO: maybe warn about unknown key?
|
// TODO: maybe warn about unknown key?
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
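Note: a standalone model of the preset negation rule introduced above — a value written under the negated key is flipped so the stored option always describes the positive flag. is_truthy and normalize are illustrative stand-ins, not the actual common_arg_utils helpers, and the set of truthy spellings here is an assumption.

    #include <iostream>
    #include <string>

    static bool is_truthy(const std::string & v) {
        return v == "1" || v == "true" || v == "on" || v == "yes";
    }

    // returns the value to store for the positive option
    static std::string normalize(const std::string & key, const std::string & value, const std::string & neg_key) {
        if (key == neg_key) {
            return is_truthy(value) ? "false" : "true";
        }
        return value;
    }

    int main() {
        std::cout << normalize("webui",    "true",  "no-webui") << "\n"; // true
        std::cout << normalize("no-webui", "true",  "no-webui") << "\n"; // false
        std::cout << normalize("no-webui", "false", "no-webui") << "\n"; // true
        return 0;
    }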
@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;

-    struct llama_sampler * grmr;
     struct llama_sampler * chain;

+    bool grammar;
+
     ring_buffer<llama_token> prev;

     std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
     void reset() {
         prev.clear();

-        llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
     }

@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

     lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             trigger_patterns_c.push_back(regex.c_str());
         }

-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                trigger_patterns_c.data(), trigger_patterns_c.size(),
-                trigger_tokens.data(), trigger_tokens.size())
-             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                        llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                            trigger_patterns_c.data(), trigger_patterns_c.size(),
+                            trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }

+            grammar = true;
         }
     }

-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }

     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                         c_breakers.push_back(str.c_str());
                     }

-                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                 }
                 break;
             case COMMON_SAMPLER_TYPE_TOP_K:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                samplers.push_back(llama_sampler_init_top_k (params.top_k));
                 break;
             case COMMON_SAMPLER_TYPE_TOP_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                 break;
             case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                 break;
             case COMMON_SAMPLER_TYPE_MIN_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                 break;
             case COMMON_SAMPLER_TYPE_XTC:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                 break;
             case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                 break;
             case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
             case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                samplers.push_back(llama_sampler_init_infill (vocab));
                 break;
             case COMMON_SAMPLER_TYPE_PENALTIES:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                 break;
             default:
                 GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
     } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
         GGML_ASSERT(false && "unknown mirostat version");
     }

+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
     return result;
 }

 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
         llama_sampler_free(gsmpl->chain);

         delete gsmpl;
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();

-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }

     gsmpl->prev.push_back(token);
 }
@@ -330,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
         /* .prev    = */ gsmpl->prev,
         /* .cur     = */ gsmpl->cur,
         /* .cur_p   = */ gsmpl->cur_p,
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     llama_synchronize(ctx);

     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
     const auto tm = gsmpl->tm();

-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);

     llama_sampler_apply(chain, &cur_p);

     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;

-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr, &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
+    return id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

     std::vector<llama_token> result;
@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

         common_sampler_accept(gsmpl, id, true);

@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }

     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

         common_sampler_accept(gsmpl, id, true);

@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
     }

     return result;
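Note: with grammar_first removed and the grammar folded into the single sampler chain, a typical sample/accept loop looks roughly like the sketch below. It assumes the signatures declared in this patch, uses idx = -1 for the logits of the last decoded token, and elides batching/decoding.

    #include "common.h"
    #include "sampling.h"
    #include "llama.h"

    // Sketch of a generation loop on top of the simplified sampler API: the
    // grammar (when configured) now lives inside the single chain, so the
    // caller just samples and accepts.
    void generate_n(llama_context * ctx, const llama_vocab * vocab, common_sampler * smpl, int n_tokens) {
        for (int i = 0; i < n_tokens; ++i) {
            const llama_token id = common_sampler_sample(smpl, ctx, /*idx =*/ -1);

            // accept_grammar = true: the grammar sampler (first in the chain)
            // also advances its state for the accepted token
            common_sampler_accept(smpl, id, /*accept_grammar =*/ true);

            if (llama_vocab_is_eog(vocab, id)) {
                break;
            }

            // ... append id to the batch and call llama_decode(ctx, batch) ...
        }
    }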
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

 // generalized version of common_sampler_sample
 //
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
     const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
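Note: common_sampler_ptr gives callers the same RAII ownership that the new common_init_result uses internally for its per-sequence samplers. A minimal sketch, assuming only the typedef and deleter declared above; with_sampler is a hypothetical helper.

    #include "sampling.h"

    // Tie the sampler lifetime to a scope: common_sampler_free() runs
    // automatically when smpl goes out of scope.
    void with_sampler(const llama_model * model, const common_params_sampling & sparams) {
        common_sampler_ptr smpl(common_sampler_init(model, sparams));
        if (!smpl) {
            return; // guard against a null result just in case
        }

        // use smpl.get() with common_sampler_sample()/common_sampler_accept() ...

    } // common_sampler_free() runs here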
|
@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
|
||||||
for (int i = 0; i < params.n_draft; ++i) {
|
for (int i = 0; i < params.n_draft; ++i) {
|
||||||
common_batch_clear(batch);
|
common_batch_clear(batch);
|
||||||
|
|
||||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
common_sampler_sample(smpl, ctx_dft, 0);
|
||||||
|
|
||||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -136,11 +136,19 @@ class ModelBase:
         self.remote_hf_model_id = remote_hf_model_id
         self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
+        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
         self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
 
+        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
+            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+                self.rope_parameters["rope_theta"] = rope_theta
+            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
+                self.rope_parameters["rope_type"] = rope_type
+
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
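Note: the new `self.rope_parameters` attribute folds the legacy `rope_scaling` dict and the loose top-level RoPE keys into one place before any model class runs. A minimal standalone sketch of that normalization (plain dicts standing in for the converter's hparams; illustration only, not the converter itself):

```python
# Standalone sketch of the rope-parameter normalization above (assumption:
# plain dicts stand in for the converter's hparams; key names follow the diff).
def normalize_rope_parameters(hparams: dict) -> dict:
    rope = dict(hparams.get("rope_parameters", hparams.get("rope_scaling")) or {})
    # only mirror when there is no per-attention-type split
    if "full_attention" not in rope and "sliding_attention" not in rope:
        if "rope_theta" not in rope:
            for key in ("rope_theta", "global_rope_theta", "rotary_emb_base"):
                if key in hparams:
                    rope["rope_theta"] = hparams[key]
                    break
        if "rope_type" not in rope and "type" in rope:
            rope["rope_type"] = rope["type"]
    return rope

# e.g. an older-style config with only "rope_scaling" and a top-level theta:
print(normalize_rope_parameters({"rope_theta": 1e6, "rope_scaling": {"type": "yarn", "factor": 4.0}}))
# -> {'type': 'yarn', 'factor': 4.0, 'rope_theta': 1000000.0, 'rope_type': 'yarn'}
```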
@@ -705,6 +713,9 @@ class ModelBase:
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "lm_config" in config:
+            # rename for GlmASR
+            config["text_config"] = config["lm_config"]
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
@@ -795,7 +806,7 @@ class TextModel(ModelBase):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
 
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
@@ -815,7 +826,42 @@ class TextModel(ModelBase):
             self.gguf_writer.add_head_count_kv(n_head_kv)
             logger.info(f"gguf: key-value head count = {n_head_kv}")
 
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
+        if (rope_type := rope_params.get("rope_type")) is not None:
+            rope_factor = rope_params.get("factor")
+            rope_gguf_type = gguf.RopeScalingType.NONE
+            if rope_type == "linear" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.LINEAR
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+            elif rope_type == "yarn" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.YARN
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+                if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
+                if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
+                if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
+                if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
+                # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
+            elif rope_type == "su" or rope_type == "longrope":
+                rope_gguf_type = gguf.RopeScalingType.LONGROPE
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+            elif rope_type == "dynamic":
+                # HunYuan, handled in model class
+                pass
+            elif rope_type.lower() == "llama3":
+                # Handled in generate_extra_tensors
+                pass
+            else:
+                logger.warning(f"Unknown RoPE type: {rope_type}")
+            logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
+
+        if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
         if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
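Note: the base-class dispatch above is what lets most model classes drop their per-model `rope_scaling` handling later in this patch. A rough sketch of the mapping it implements, with plain strings standing in for the `gguf.RopeScalingType` enum values (an assumption for illustration only):

```python
# Sketch of the rope_type dispatch above; strings stand in for gguf.RopeScalingType.
def rope_scaling_kind(rope_params: dict) -> str:
    rope_type = rope_params.get("rope_type")
    factor = rope_params.get("factor")
    if rope_type == "linear" and factor is not None:
        return "LINEAR"
    if rope_type == "yarn" and factor is not None:
        return "YARN"   # plus factor, original ctx len and optional yarn extras
    if rope_type in ("su", "longrope"):
        return "LONGROPE"
    if rope_type in ("dynamic", "llama3"):
        return "NONE"   # handled by the model class / generate_extra_tensors
    return "NONE"

print(rope_scaling_kind({"rope_type": "yarn", "factor": 4.0,
                         "original_max_position_embeddings": 32768}))  # YARN
```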
@@ -1486,6 +1532,21 @@ class TextModel(ModelBase):
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_glmedge(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_interns1(self):
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -1615,7 +1676,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
 
     has_vision_encoder: bool = True # by default
     has_audio_encoder: bool = False
@@ -1691,7 +1752,8 @@ class MmprojModel(ModelBase):
         return self.global_config.get(config_name)
 
     def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("audio_config")
+        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
+        return self.global_config.get(mm_config_key)
 
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@@ -1966,34 +2028,10 @@ class BaichuanModel(TextModel):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
-
+        super().set_gguf_parameters()
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         head_count = self.hparams["num_attention_heads"]
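Note: relying on `super().set_gguf_parameters()` here only works because the shared context-length lookup earlier in this patch now also checks `max_sequence_length` and `model_max_length`. A plain-dict sketch of that first-match lookup (the helper name is made up; the real code goes through `find_hparam`):

```python
# Sketch of the first-match context-length lookup that replaces the removed
# per-model if/elif chains (plain-dict stand-in for find_hparam).
def find_ctx_len(hparams: dict) -> int | None:
    keys = ["max_position_embeddings", "n_ctx", "n_positions",
            "max_length", "max_sequence_length", "model_max_length"]
    return next((hparams[k] for k in keys if k in hparams), None)

print(find_ctx_len({"max_sequence_length": 4096, "hidden_size": 5120}))  # 4096
```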
@@ -2089,34 +2127,10 @@ class XverseModel(TextModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
-
+        super().set_gguf_parameters()
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -2377,8 +2391,13 @@ class LlamaModel(TextModel):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
     def set_vocab(self):
+        if self.origin_hf_arch == "GlmasrModel":
+            return self._set_vocab_glmedge()
+
         if self.is_mistral_format:
             return self._set_vocab_mistral()
@@ -2430,11 +2449,6 @@ class LlamaModel(TextModel):
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -2454,6 +2468,7 @@ class LlamaModel(TextModel):
             "vision_language_adapter.",
             "patch_merger.",
             "pre_mm_projector_norm",
+            "audio_encoder.",
         ]
 
         is_multimodal_tensor = "vision_tower" in name \
@@ -2518,16 +2533,16 @@ class LlamaModel(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 
                 low_freq_wavelen = old_context_len / low_freq_factor
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self._try_set_pooling_type()
|
self._try_set_pooling_type()
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("AfmoeForCausalLM")
|
@ModelBase.register("AfmoeForCausalLM")
|
||||||
|
|
@@ -2851,17 +2861,11 @@ class Mistral3Model(LlamaModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        rope_params = self.hparams.get("rope_parameters")
+        rope_params = self.rope_parameters
         if self.hparams.get("model_type") == "ministral3":
-            assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
+            assert rope_params, "ministral3 must have 'rope_parameters' config"
             assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
             self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
             self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
@@ -2958,7 +2962,7 @@ class DeciModel(TextModel):
             assert self.block_count == len(self._num_kv_heads)
             assert self.block_count == len(self._num_heads)
             assert self.block_count == len(self._ffn_dims)
-            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
                 self.gguf_writer.add_rope_freq_base(rope_theta)
             self.gguf_writer.add_head_count_kv(self._num_kv_heads)
             self.gguf_writer.add_head_count(self._num_heads)
@@ -2983,11 +2987,6 @@ class DeciModel(TextModel):
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -3016,16 +3015,16 @@ class DeciModel(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 
                 low_freq_wavelen = old_context_len / low_freq_factor
@@ -3279,10 +3278,6 @@ class MiniCPMModel(TextModel):
         logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
         self.gguf_writer.add_logit_scale(logit_scale)
         logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
-            logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@@ -3402,17 +3397,6 @@ class QwenModel(TextModel):
     def set_vocab(self):
         self._set_vocab_qwen()
 
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
 
 @ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
 class Qwen2Model(TextModel):
@@ -3427,11 +3411,6 @@ class Qwen2Model(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self._try_set_pooling_type()
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if self.hf_arch == "Qwen2Model":
@@ -3499,12 +3478,6 @@ class DreamModel(TextModel):
 
         # Dream models use non-causal attention for diffusion
         self.gguf_writer.add_causal_attention(False)
-        # Handle RoPE scaling similar to Qwen2
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
         # Add Dream-specific parameters
         mask_token_id = self.hparams.get("mask_token_id")
@@ -4048,13 +4021,6 @@ class Qwen2MoeModel(TextModel):
         if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
             self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
             logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
     _experts: list[dict[str, Tensor]] | None = None
@@ -4656,7 +4622,7 @@ class Phi3MiniModel(TextModel):
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
         self.gguf_writer.add_rope_dimension_count(rope_dims)
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
         self.gguf_writer.add_file_type(self.ftype)
         sliding_window = self.hparams.get("sliding_window")
         # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
@@ -4932,7 +4898,7 @@ class Plamo2Model(TextModel):
         self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
         self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -5130,21 +5096,6 @@ class InternLM2Model(TextModel):
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_file_type(self.ftype)
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
@@ -5221,11 +5172,6 @@ class InternLM3Model(TextModel):
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
@@ -5588,7 +5534,6 @@ class NomicBertModel(BertModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
         if self.is_moe:
             self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
             self.gguf_writer.add_expert_count(self.hparams["num_experts"])
@@ -5711,8 +5656,6 @@ class XLMRobertaModel(BertModel):
         super().set_gguf_parameters()
 
         # jina-embeddings-v3
-        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
-            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
         lora_alpha = self.hparams.get("lora_alpha")
         if lora_prompt_prefixes := self.hparams.get("task_instructions"):
             assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
@@ -5840,19 +5783,16 @@ class Gemma3Model(TextModel):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         hparams = self.hparams
 
         # some default values are not specified in the hparams
         self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
         self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
         self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
         # attn_logit_softcapping is removed in Gemma3
         assert hparams.get("attn_logit_softcapping") is None
         if (final_logit_softcap := hparams.get("final_logit_softcapping")):
@@ -5860,19 +5800,6 @@ class Gemma3Model(TextModel):
         if hparams.get("sliding_window_pattern") != 1:
             self.gguf_writer.add_sliding_window(hparams["sliding_window"])
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
-        if hparams.get("rope_scaling") is not None:
-            rope_scaling = hparams["rope_scaling"]
-            if rope_scaling["rope_type"] == "linear":
-                # important: this rope_scaling is only applied for global layers, and not used by 1B model
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            elif rope_scaling["rope_type"] == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -6776,13 +6703,6 @@ class Olmo2Model(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
         if "sliding_window" in self.hparams:
             self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
@@ -7281,16 +7201,11 @@ class DeepseekV2Model(TextModel):
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
+        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
             # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
             # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
             # ref https://github.com/ggml-org/llama.cpp/pull/17945
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
 
     _experts: list[dict[str, Tensor]] | None = None
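Note: per the TAG comment, DeepSeek keeps the legacy convention of writing `0.1 * mscale_all_dim` rather than the raw value. A quick illustration with a hypothetical `mscale_all_dim` (not read from any particular checkpoint):

```python
# Illustration of the legacy convention noted above; 0.707 is a hypothetical
# mscale_all_dim value, not taken from a specific model config.
mscale_all_dim = 0.707
yarn_log_mul = 0.1 * mscale_all_dim   # the value actually written to the GGUF KV
print(yarn_log_mul)                   # ~0.0707
```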
@@ -7898,11 +7813,6 @@ class Glm4Model(TextModel):
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."): # ignore visual part of Glm4v
@@ -8240,50 +8150,26 @@ class ExaoneModel(TextModel):
     model_arch = gguf.MODEL_ARCH.EXAONE
 
     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         hparams = self.hparams
 
         assert (hparams["activation_function"] == "silu")
 
-        max_position_embeddings = hparams["max_position_embeddings"]
-        embed_dim = hparams["hidden_size"]
-        num_heads = hparams["num_attention_heads"]
-        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
-        layer_norm_eps = hparams["layer_norm_epsilon"]
-        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
-        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
-        # attention_dropout_rate = hparams["attention_dropout"]
-        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
-        # embed_dropout_rate = hparams["embed_dropout"]
-        self.gguf_writer.add_embedding_length(embed_dim)
-        self.gguf_writer.add_head_count(num_heads)
-        self.gguf_writer.add_head_count_kv(num_kv_heads)
-        self.gguf_writer.add_context_length(max_position_embeddings)
-        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
-        self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_file_type(self.ftype)
-
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
         rotary_factor = rotary_factor if rotary_factor is not None else 1.0
         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = self.rope_parameters.get("rope_theta", 10000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 
                 low_freq_wavelen = old_context_len / low_freq_factor
@@ -8338,22 +8224,17 @@ class Exaone4Model(TextModel):
         if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
             self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10_000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10_000.0)
                 if (dim := self.hparams.get("head_dim")) is None:
                     dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
-                factor = rope_scaling.get("factor", 16.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 16.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
 
                 low_freq_wavelen = old_context_len / low_freq_factor
@@ -8664,13 +8545,6 @@ class BailingMoeModel(TextModel):
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -8777,13 +8651,6 @@ class BailingMoeV2Model(TextModel):
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -8862,13 +8729,6 @@ class GroveMoeModel(TextModel):
         self.gguf_writer.add_experts_per_group(2)
         # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
         self.gguf_writer.add_expert_group_scale(0.05)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
 
     _experts: list[dict[str, Tensor]] | None = None
     _chunk_experts: list[dict[str, Tensor]] | None = None
@@ -9011,6 +8871,63 @@ class UltravoxModel(TextModel):
         raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
 
 
+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.startswith("model.") or name.startswith("lm_head."):
+            # skip language model tensors
+            return []
+
+        if name.startswith("audio_encoder.whisper."):
+            name = name.replace("audio_encoder.whisper.","audio_tower.")
+        if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
+            name = name.replace("audio_encoder.", "audio_encoder.adapting.")
+
+        if name.startswith("audio_encoder.audio_bos_eos_token."):
+            return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
+
+        if name.startswith("audio_encoder.adapting."):
+            name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
+            if ".layer_norm." in name:
+                name = name.replace(".layer_norm.", ".ln_pre.")
+            if ".0." in name:
+                name = name.replace(".0.", ".linear_1.")
+            if ".2." in name:
+                name = name.replace(".2.", ".linear_2.")
+            if ".proj." in name:
+                return []
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("Qwen2AudioForConditionalGeneration")
 class WhisperEncoderModel(MmprojModel):
     has_vision_encoder = False # no vision encoder
@@ -9178,7 +9095,7 @@ class FalconH1Model(Mamba2Model):
         assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"

         # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])


 @ModelBase.register("HunYuanMoEV1ForCausalLM")

@@ -9256,12 +9173,11 @@ class HunYuanMoEModel(TextModel):
         self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])

         # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 1000)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 1000)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
             dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
             scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
             self.gguf_writer.add_rope_freq_base(scaled_base)

@@ -9456,12 +9372,11 @@ class HunYuanModel(TextModel):
         hparams = self.hparams

         # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 50)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 50)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
             dim = hparams["head_dim"]
             scaled_base = base * (alpha ** (dim / (dim - 2)))
             self.gguf_writer.add_rope_freq_base(scaled_base)

@@ -9612,13 +9527,6 @@ class GptOssModel(TextModel):
         self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
-

 @ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -9791,13 +9699,6 @@ class SmallThinkerModel(TextModel):
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
         else:
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

         sliding_window_layout = self.hparams.get("sliding_window_layout")
         if sliding_window_layout:
@@ -9,7 +9,8 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
-- [main](/tools/main/)
+- [cli](/tools/cli/)
+- [completion](/tools/completion/)
 - [imatrix](/tools/imatrix/)
 - [quantize](/tools/quantize/)
 - [server](/tools/server/)
docs/ops.md (18 changed lines)

@@ -18,12 +18,12 @@ Legend:
 | ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 | CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |

@@ -31,7 +31,7 @@ Legend:
 | CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |

@@ -64,7 +64,7 @@ Legend:
 | IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 | L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |

@@ -98,14 +98,14 @@ Legend:
 | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |

@@ -113,7 +113,7 @@ Legend:
 | SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 | SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
docs/ops/SYCL.csv (1158 changed lines): file diff suppressed because it is too large.
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cstdio>

@@ -64,11 +65,12 @@ int main(int argc, char ** argv) {
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);

-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-
     auto sparams = llama_sampler_chain_default_params();
     sparams.no_perf = false;

+    std::vector<llama_sampler *> samplers;
+
+    for (int32_t i = 0; i < n_parallel; ++i) {
     llama_sampler * smpl = llama_sampler_chain_init(sparams);

     llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));

@@ -76,6 +78,11 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
     llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

+        samplers.push_back(smpl);
+    }
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+
     if (ctx == NULL) {
         LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;

@@ -173,7 +180,7 @@ int main(int argc, char ** argv) {
                 continue;
             }

-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);

             // is it an end of generation? -> mark the stream as finished
             if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {

@@ -229,14 +236,17 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG("\n");
-    llama_perf_sampler_print(smpl);
+    llama_perf_sampler_print(samplers[0]);
     llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

     llama_batch_free(batch);

-    llama_sampler_free(smpl);
+    for (auto & sampler_config : samplers) {
+        llama_sampler_free(sampler_config);
+    }

     llama_free(ctx);
     llama_model_free(model);
@@ -131,10 +131,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);

@@ -202,10 +202,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
@@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
-    for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
+    auto all_args = opt.get_args();
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
             file << arg;
-            if (opt.args.size() > 1) file << ", ";
+            if (all_args.size() > 1) file << ", ";
         } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
+            file << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     // value hint
@@ -55,10 +55,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the target model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     auto * mem = llama_get_memory(ctx);


@@ -18,16 +18,16 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     GGML_ASSERT(model != nullptr);

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);

     common_ngram_cache ngram_cache;
@@ -28,13 +28,13 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_context_ptr & ctx = llama_init.context;
+    llama_context * ctx = llama_init->context();

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);

     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;

@@ -65,7 +65,7 @@ int main(int argc, char ** argv){
     }

     const int n_input = inp.size();
-    const int n_ctx = llama_n_ctx(ctx.get());
+    const int n_ctx = llama_n_ctx(ctx);

     int n_drafted = 0;
     int n_accept = 0;

@@ -29,10 +29,10 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     const llama_vocab * vocab = llama_model_get_vocab(model);

@@ -1,10 +1,13 @@
 #!/usr/bin/env python3

-import numpy as np
 import sys
-import os
+import numpy as np
 from pathlib import Path

+# Add utils directory to path for direct script execution
+sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
+from common import get_model_name_from_env_path # type: ignore[import-not-found]

 def quick_logits_check(pytorch_file, llamacpp_file):
     """Lightweight sanity check before NMSE"""

@@ -35,20 +38,13 @@ def quick_logits_check(pytorch_file, llamacpp_file):
     return True

 def main():
-    model_path = os.getenv('MODEL_PATH')
-    if not model_path:
-        print("Error: MODEL_PATH environment variable not set")
-        sys.exit(1)
-
-    if not os.path.exists(model_path):
-        print(f"Error: Model file not found: {model_path}")
-        sys.exit(1)
-
-    model_name = os.path.basename(model_path)
+    model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    print(f"Using converted model: {llamacpp_model_name}")
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

     if not pytorch_file.exists():
         print(f"Error: PyTorch logits file not found: {pytorch_file}")

@@ -200,7 +200,7 @@ with torch.no_grad():
     logits = outputs.logits

     # Extract logits for the last token (next token prediction)
-    last_logits = logits[0, -1, :].cpu().numpy()
+    last_logits = logits[0, -1, :].float().cpu().numpy()

     print(f"Logits shape: {logits.shape}")
     print(f"Last token logits shape: {last_logits.shape}")
@@ -5,6 +5,7 @@ import sys
 import os
 import argparse
 from pathlib import Path
+from common import get_model_name_from_env_path # type: ignore[import-not-found]

 def calculate_nmse(reference, test):
     mse = np.mean((test - reference) ** 2)

@@ -67,11 +68,13 @@ def main():
     parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
     args = parser.parse_args()

-    model_name = os.path.basename(args.model_path)
+    model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
-    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
+    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
+    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

     print(f"Model name: {model_name}")
     print(f"PyTorch logits file: {pytorch_file}")

@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+def get_model_name_from_env_path(env_path_name):
+    model_path = os.getenv(env_path_name)
+    if not model_path:
+        print(f"Error: {env_path_name} environment variable not set")
+        sys.exit(1)
+
+    if not os.path.exists(model_path):
+        print(f"Error: Model file not found: {model_path}")
+        sys.exit(1)
+
+    name = os.path.basename(os.path.normpath(model_path))
+    if name.endswith(".gguf"):
+        name = name[:-5]
+
+    return name
@@ -192,10 +192,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the target model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     auto * mem = llama_get_memory(ctx);


@@ -149,10 +149,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);

@@ -34,10 +34,10 @@ int main(int argc, char ** argv) {
     std::string result2;

     // init
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
@@ -40,10 +40,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;

     // load the target model
-    common_init_result llama_init_tgt = common_init_from_params(params);
+    auto llama_init_tgt = common_init_from_params(params);

-    model_tgt = llama_init_tgt.model.get();
-    ctx_tgt = llama_init_tgt.context.get();
+    model_tgt = llama_init_tgt->model();
+    ctx_tgt = llama_init_tgt->context();

     const llama_vocab * vocab = llama_model_get_vocab(model_tgt);


@@ -61,10 +61,10 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

-    common_init_result llama_init_dft = common_init_from_params(params);
+    auto llama_init_dft = common_init_from_params(params);

-    //model_dft = llama_init_dft.model.get();
-    ctx_dft = llama_init_dft.context.get();
+    //model_dft = llama_init_dft->model();
+    ctx_dft = llama_init_dft->context();

     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
         LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());

@@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
     LOG_INF("target:\n\n");
     common_perf_print(ctx_tgt, smpl);

+    llama_batch_free(batch_tgt);
+
     common_sampler_free(smpl);
     common_speculative_free(spec);

@@ -71,10 +71,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;

     // load the target model
-    common_init_result llama_init_tgt = common_init_from_params(params);
+    auto llama_init_tgt = common_init_from_params(params);

-    model_tgt = llama_init_tgt.model.get();
-    ctx_tgt = llama_init_tgt.context.get();
+    model_tgt = llama_init_tgt->model();
+    ctx_tgt = llama_init_tgt->context();

     // load the draft model
     params.devices = params.speculative.devices;

@@ -87,10 +87,10 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

-    common_init_result llama_init_dft = common_init_from_params(params);
+    auto llama_init_dft = common_init_from_params(params);

-    model_dft = llama_init_dft.model.get();
-    ctx_dft = llama_init_dft.context.get();
+    model_dft = llama_init_dft->model();
+    ctx_dft = llama_init_dft->context();

     const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
     const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

@@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
             bool accept = false;
             if (params.sampling.temp > 0) {
                 // stochastic verification
-                common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
+                common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

                 auto & dist_tgt = *common_sampler_get_candidates(smpl, true);


@@ -491,7 +491,7 @@ int main(int argc, char ** argv) {
                 continue;
             }

-            common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+            common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);

             const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);

@@ -39,9 +39,10 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
     // load the model and apply lora adapter, if any
-    common_init_result llama_init = common_init_from_params(params);
-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
+    auto llama_init = common_init_from_params(params);
+
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);

@@ -54,8 +55,8 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

-    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
-    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx, tokens, llama_n_ctx(ctx) / 2);

     struct lr_opt & lr = params.lr;
     LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",

@@ -70,7 +71,7 @@ int main(int argc, char ** argv) {
         /*get_opt_pars_ud =*/&params.lr,
         /*optimizer_type =*/params.optimizer,
     };
-    llama_opt_init(ctx.get(), model.get(), lopt_params);
+    llama_opt_init(ctx, model, lopt_params);

     const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);

@@ -78,7 +79,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_t result_eval = ggml_opt_result_init();

     for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
-        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+        llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
             ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
         fprintf(stderr, "\n");

@@ -88,7 +89,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_free(result_train);
     ggml_opt_result_free(result_eval);

-    llama_model_save_to_file(model.get(), params.out_file.c_str());
+    llama_model_save_to_file(model, params.out_file.c_str());

     llama_backend_free();
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()

 if (EMSCRIPTEN)
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc,
+        struct ggml_cgraph * graph,
+        const int * node_buffer_ids,
+        const int * leaf_buffer_ids,
+        size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
         ggml_gallocr_t galloc,
         struct ggml_cgraph * graph,
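As a reading aid only (not part of the patch): a minimal sketch of how the new size-query entry point might be used to ask how large the compute buffer would be for a worst-case graph, without actually allocating it. It assumes a prebuilt `graph`, a single buffer type (so `sizes` has one entry), and that `ggml_backend_cpu_buffer_type()` is available from ggml-backend.h (it may live in ggml-cpu.h depending on the ggml version); error handling is omitted.

// Hedged usage sketch for ggml_gallocr_reserve_n_size() under the assumptions above.
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

static void print_required_size(struct ggml_cgraph * graph) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_gallocr_t galloc = ggml_gallocr_new(buft);   // one buffer type -> one size entry

    size_t sizes[1] = { 0 };
    // NULL buffer ids: every node/leaf goes to buffer 0, mirroring ggml_gallocr_reserve()
    ggml_gallocr_reserve_n_size(galloc, graph, NULL, NULL, sizes);

    printf("compute buffer would need %.2f MiB\n", sizes[0] / 1024.0 / 1024.0);
    ggml_gallocr_free(galloc);
}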
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
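A hedged sketch (not from the patch) of what the new sizing helper enables: measuring a weight context before committing to the allocation. The tensor shapes below are invented for illustration, and `buft` is assumed to be any valid backend buffer type.

// Sizing a no_alloc context with ggml_backend_alloc_ctx_tensors_from_buft_size(), assumptions as above.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

static void size_weights_example(ggml_backend_buffer_type_t buft) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                        // tensors are only described, not allocated
    };
    struct ggml_context * ctx = ggml_init(ip);

    ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096); // hypothetical weight
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);       // hypothetical bias

    size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    printf("weights would need %zu bytes in this buffer type\n", need);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // actual allocation
    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}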
@@ -307,6 +307,7 @@ extern "C" {
    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
+   GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
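For orientation only (not part of the patch): a sketch of how the scheduler-level variant could be used to report per-backend compute buffer sizes from a measure graph instead of reserving them directly. It assumes `sched` and `measure_graph` are set up exactly as they would be for `ggml_backend_sched_reserve()`, and that one size entry per scheduler backend is written, which is how the underlying galloc buffers are laid out.

// Hedged usage sketch for ggml_backend_sched_reserve_size() under the assumptions above.
#include "ggml-backend.h"
#include <stdio.h>

static void report_sched_sizes(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    const int n_backends = ggml_backend_sched_get_n_backends(sched);

    size_t sizes[16] = { 0 };   // assumption: at most 16 backends in this scheduler
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes);

    for (int i = 0; i < n_backends; ++i) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        printf("%s: %.2f MiB\n", ggml_backend_name(backend), sizes[i] / 1024.0 / 1024.0);
    }
}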
@@ -2615,6 +2615,7 @@ extern "C" {

     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
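A small sketch (not from the patch) of the pattern the new getter makes possible: saving the currently installed log callback, swapping in a temporary one, and restoring the original afterwards. The wrapper name is hypothetical.

// Hedged sketch: save/restore the ggml log callback with ggml_log_get()/ggml_log_set().
#include "ggml.h"

static void ggml_log_silent(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) text; (void) user_data;   // drop all messages
}

void with_quiet_ggml(void (*body)(void)) {         // hypothetical helper
    ggml_log_callback prev_cb = NULL;
    void * prev_ud = NULL;
    ggml_log_get(&prev_cb, &prev_ud);              // remember whatever was installed before

    ggml_log_set(ggml_log_silent, NULL);
    body();
    ggml_log_set(prev_cb, prev_ud);                // restore it afterwards
}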
@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }

 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }

 // free the extra space at the end if the new tensor is smaller

@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;

@@ -928,12 +931,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
             if (cur_size > 0) {
                 GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                    __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                    cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                    __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
             }
         }
 #endif
         ggml_vbuffer_free(galloc->buffers[i]);
+        if (no_alloc) {
+            galloc->buffers[i] = NULL;
+        } else {
             galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
@@ -941,10 +946,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    }

     return true;
 }

+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1147,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

     size_t alignment = ggml_backend_buft_get_alignment(buft);

@@ -1155,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;

     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);

@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;

@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }

+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }


@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     } else {
         buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
     return buffer;
 }

+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }

-    GGML_ASSERT(buft);
-
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }

+    // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+    // I don't know whether the above comment is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);

     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
@@ -24,6 +24,7 @@

 #define UNUSED GGML_UNUSED

+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t * out_mins,
                                              int8_t * out_scales) {

@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif

 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
     const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
     const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     const bool did_not_have_any_data   = kbc0 == kbc0_stop;
     const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;

@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
     int bidx = bidx0 - 1;
     int kbc_stop = kbc0;
     while(true) {
-        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
         if (kbc == kbc_stop) { // Did not have any data.
             bidx--;
             kbc_stop = kbc;

@@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
     const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;

     // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
     // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
|
|
@@ -769,9 +769,16 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 #endif
 
     dev->props.use_shared_buffers = dev->props.has_unified_memory;
+#if TARGET_OS_OSX
+    // In case of eGPU, shared memory may be preferable.
+    dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
+#endif
 
     if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
         dev->props.use_shared_buffers = false;
     }
+    if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
+        dev->props.use_shared_buffers = true;
+    }
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
@@ -0,0 +1,77 @@
+#include <sycl/sycl.hpp>
+#include "common.hpp"
+#include "add-id.hpp"
+
+static void add_id_kernel(
+    const float* src0,
+    const float* src1,
+    const int32_t* src2,
+    float* dst,
+    int64_t ne0,
+    int64_t ne1,
+    size_t nb01,
+    size_t nb02,
+    size_t nb11,
+    size_t nb21,
+    sycl::nd_item<3> item_ct1) {
+    const int64_t i1 = item_ct1.get_group(2);
+    const int64_t i2 = item_ct1.get_group(1);
+
+    const int i11 =
+        *(const int32_t*)((const char*)src2 + i1 * sizeof(int32_t) + i2 * nb21);
+
+    const size_t nb1 = ne0 * sizeof(float);
+    const size_t nb2 = ne1 * nb1;
+
+    float* dst_row = (float*)((char*)dst + i1 * nb1 + i2 * nb2);
+    const float* src0_row =
+        (const float*)((const char*)src0 + i1 * nb01 + i2 * nb02);
+    const float* src1_row = (const float*)((const char*)src1 + i11 * nb11);
+
+    for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0;
+         i0 += item_ct1.get_local_range(2)) {
+        dst_row[i0] = src0_row[i0] + src1_row[i0];
+    }
+}
+
+void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    const ggml_tensor* src0 = dst->src[0];
+    const ggml_tensor* src1 = dst->src[1];
+    const ggml_tensor* src2 = dst->src[2];
+
+    GGML_TENSOR_TERNARY_OP_LOCALS
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb20 == sizeof(int32_t));
+
+    const float* src0_d = (const float*)src0->data;
+    const float* src1_d = (const float*)src1->data;
+    const int32_t* src2_d = (const int32_t*)src2->data;
+    float* dst_d = (float*)dst->data;
+
+    int threads = std::min((int)ne00, 768); // cols
+    ctx.stream()->parallel_for(
+        sycl::nd_range<3>(
+            sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
+            sycl::range<3>(1, 1, threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            add_id_kernel(
+                src0_d,
+                src1_d,
+                src2_d,
+                dst_d,
+                ne0,
+                ne1,
+                nb01,
+                nb02,
+                nb11,
+                nb21,
+                item_ct1);
+        });
+}
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ADD_ID_HPP
+#define GGML_SYCL_ADD_ID_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ADD_ID_HPP
@@ -642,5 +642,22 @@ static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3
     return sycl::uint2(div_val, mod_val);
 }
 
+static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
+    return dpct::dp4a(a, b, c);
+}
+
+static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
+    uint32_t bits;
+    if (x == 0) {
+        bits = 0x00400000;
+    } else {
+        bits = (uint32_t) x << 23;
+    }
+
+    float result;
+    memcpy(&result, &bits, sizeof(float));
+    return result;
+}
+
 #endif // GGML_SYCL_COMMON_HPP
@@ -472,6 +472,16 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_mxfp4_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
+        [=](sycl::nd_item<3> item_ct1) {
+            dequantize_block_mxfp4(vx, y, item_ct1);
+        });
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
                              const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
@@ -518,6 +528,7 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
     convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
 }
 
+
 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -571,6 +582,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
             return dequantize_row_iq4_xs_sycl;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_sycl;
         case GGML_TYPE_F32:
             return convert_unary_sycl<float>;
 #ifdef GGML_SYCL_HAS_BF16
@@ -636,6 +649,8 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
             return dequantize_row_iq4_xs_sycl;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_sycl;
         case GGML_TYPE_F16:
             return convert_unary_sycl<sycl::half>;
 #ifdef GGML_SYCL_HAS_BF16
@@ -819,5 +819,23 @@ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
     }
 }
 
+template<typename dst_t>
+static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+    const int64_t i = item_ct1.get_group(2);
+    const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);
+
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = ggml_sycl_e8m0_to_fp32(x[ib].e);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
+        y[j+16] = d * kvalues_mxfp4[q4[j] >> 4]*0.5f;
+    }
+}
+
 #endif // GGML_SYCL_DEQUANTIZE_HPP
@@ -1860,10 +1860,31 @@ namespace dpct
                    : id);
    }
 
+   template <typename T1, typename T2>
+   using dot_product_acc_t = std::conditional_t<
+       std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+       uint32_t,
+       int32_t>;
+
+   template <typename T>
+   sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
+       return sycl::vec<T, 1>(val)
+           .template as<sycl::vec<
+               std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>,
+               4>>()
+           .template convert<T>();
+   }
+
    template <typename T1, typename T2, typename T3>
-   inline auto dp4a(T1 a, T2 b, T3 c)
-   {
-       return syclcompat::dp4a(a, b, c);
+   inline auto dp4a(T1 a, T2 b, T3 c) {
+       dot_product_acc_t<T1, T2> res = c;
+       auto va = extract_and_sign_or_zero_extend4(a);
+       auto vb = extract_and_sign_or_zero_extend4(b);
+       res += va[0] * vb[0];
+       res += va[1] * vb[1];
+       res += va[2] * vb[2];
+       res += va[3] * vb[3];
+       return res;
    }
 
    struct sub_sat
@@ -2972,6 +2993,38 @@ namespace dpct
        atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
    }
 
+   inline unsigned int byte_level_permute(
+       unsigned int a, unsigned int b, unsigned int s) {
+       unsigned int ret;
+       ret = ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff)
+              << 8) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff)
+              << 16) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff)
+              << 24);
+       return ret;
+   }
+
+   inline uint32_t byte_level_permute_custom(
+       uint32_t low32, uint32_t high32, uint32_t sel, int mode = 0) {
+       constexpr uint16_t lookup[6][4] = {
+           {0x3210, 0x4321, 0x5432, 0x6543}, // Forward 4-byte extract
+           {0x5670, 0x6701, 0x7012, 0x0123}, // Backward 4-byte extract
+           {0x0000, 0x1111, 0x2222, 0x3333}, // Replicate 8-bit values
+           {0x3210, 0x3211, 0x3222, 0x3333}, // Edge clamp left
+           {0x0000, 0x1110, 0x2210, 0x3210}, // Edge clamp right
+           {0x1010, 0x3232, 0x1010, 0x3232}  // Replicate 16-bit values
+       };
+
+       if (mode >= 1 && mode <= 6) {
+           return byte_level_permute(low32, high32, lookup[mode - 1][sel & 0x3]);
+       } else if (!mode) {
+           return byte_level_permute(low32, high32, sel);
+       }
+       return 0;
+   }
+
 } // COPY from DPCT head files
 
 #endif // GGML_SYCL_DPCT_HELPER_HPP
@@ -911,6 +911,98 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
         });
 }
 
+__dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
+    x = sycl::fmin(x, limit);
+    g = sycl::fmax(sycl::fmin(g, limit), -limit);
+
+    float out_glu = x / (1.0f + sycl::native::exp(-x * alpha));
+    out_glu = out_glu * (1.0f + g);
+    return out_glu;
+}
+
+
+template <typename T>
+static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k,
+                              const int64_t n, const int64_t o0, const int64_t o1,
+                              float alpha, float limit, sycl::nd_item<3> item_ct1) {
+    const int64_t i = int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t j0 = (i / n) * o0 + (i % n);
+    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+
+    float xi = x[j0];
+    float gi = g[j1];
+
+    dst[i] = ggml_sycl_op_swiglu_oai_single(xi, gi, alpha, limit);
+}
+
+template <typename T>
+static void swiglu_oai_sycl(const T * x,
+                            const T * g,
+                            T * dst,
+                            const int64_t k,
+                            const int64_t n,
+                            const int64_t o0,
+                            const int64_t o1,
+                            const float alpha,
+                            const float limit,
+                            dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE),
+                                           sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1);
+                         });
+}
+
+void ggml_sycl_op_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    void * src0_d = src0->data;
+    void * src1_d = src1 ? src1->data : src0->data;
+    const int64_t src0_o = src0->nb[1];
+    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+    void * dst_d = dst->data;
+    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+        GGML_ASSERT(src1->ne[0] == nc);
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    //const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    const float alpha = ggml_get_op_params_f32(dst, 2);
+    const float limit = ggml_get_op_params_f32(dst, 3);
+
+    float * src0_p = (float *) src0_d;
+    float * src1_p = (float *) src1_d;
+
+    if (!src1) {
+        src0_p += swapped ? nc : 0;
+        src1_p += swapped ? 0 : nc;
+    }
+
+    swiglu_oai_sycl(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
+}
+
 static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
@@ -1070,6 +1162,11 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_swiglu(ctx, dst);
 }
 
+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_swiglu_oai(ctx, dst);
+}
+
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_geglu_erf(ctx, dst);
@@ -5,6 +5,8 @@
 #include "ggml.h"
 #include <limits> // For std::numeric_limits
 
+#define SYCL_GLU_BLOCK_SIZE 256
+
 template <typename T>
 T neg_infinity() {
     return -std::numeric_limits<T>::infinity();
@@ -41,6 +43,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
@@ -39,6 +39,7 @@
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
+#include "ggml-sycl/add-id.hpp"
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
 #include "ggml-sycl/element_wise.hpp"
@@ -3313,6 +3314,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
+
     // mmvq and mmq need the __dp4a instruction which is available for gen12+
     // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
     use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
@@ -3320,7 +3322,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
-
     // mmvq path is faster in the CUDA backend.
     if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
         // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
@@ -3711,6 +3712,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_ADD1: // TODO: more efficient implementation
             ggml_sycl_add(ctx, dst);
             break;
+        case GGML_OP_ADD_ID:
+            ggml_sycl_add_id(ctx, dst);
+            break;
         case GGML_OP_SUB:
             ggml_sycl_sub(ctx, dst);
             break;
@@ -3803,6 +3807,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                 case GGML_GLU_OP_SWIGLU:
                     ggml_sycl_swiglu(ctx, dst);
                     break;
+                case GGML_GLU_OP_SWIGLU_OAI:
+                    ggml_sycl_swiglu_oai(ctx, dst);
+                    break;
                 case GGML_GLU_OP_GEGLU_ERF:
                     ggml_sycl_geglu_erf(ctx, dst);
                     break;
@@ -4397,6 +4404,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_GLU_OP_REGLU:
                 case GGML_GLU_OP_GEGLU:
                 case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_SWIGLU_OAI:
                 case GGML_GLU_OP_GEGLU_ERF:
                 case GGML_GLU_OP_GEGLU_QUICK:
                     return ggml_is_contiguous_1(op->src[0]);
@@ -4424,15 +4432,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 }
             }
             ggml_type src0_type = op->src[0]->type;
-            if (src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_MXFP4) {
-                // TODO: support MXFP4
+            if (src0_type == GGML_TYPE_BF16 ) {
+                // TODO: support GGML_TYPE_BF16
                 // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
                 return false;
            }
 
            // TODO: The configuration below needs more work to be supported with oneDNN
-           if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
+           if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
+               a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
                return false;
            }
 
            // TODO: This specific configuration can fail with oneDNN and needs more debugging
            if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
                a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
@@ -4553,9 +4564,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            return true;
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
+        case GGML_OP_ADD_ID:
         case GGML_OP_SUB:
         case GGML_OP_COUNT_EQUAL:
         case GGML_OP_MUL:
||||||
|
|
@ -595,6 +595,25 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
|
||||||
|
dpct::queue_ptr stream) {
|
||||||
|
GGML_ASSERT(ncols % QK_MXFP4 == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||||
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||||
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||||
|
|
||||||
|
{
|
||||||
|
stream->submit([&](sycl::handler & cgh) {
|
||||||
|
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
mul_mat_vec_q<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
|
||||||
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||||
float *dst, const int ncols,
|
float *dst, const int ncols,
|
||||||
const int nrows,
|
const int nrows,
|
||||||
|
|
@ -1123,6 +1142,9 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
|
case GGML_TYPE_MXFP4:
|
||||||
|
mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -16,8 +16,8 @@
 static void pad_f32(const float * src, float * dst,
                     const int lp0, const int rp0, const int lp1, const int rp1,
                     const int lp2, const int rp2, const int lp3, const int rp3,
-                    const int ne0, const int ne1, const int ne2, const int ne3) {
-    auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+                    const int ne0, const int ne1, const int ne2, const int ne3,
+                    sycl::nd_item<3> item_ct1) {
     int i0 = item_ct1.get_local_id(2) +
              item_ct1.get_group(2) * item_ct1.get_local_range(2);
     int i1 = item_ct1.get_group(1);
@@ -63,7 +63,7 @@ static void pad_f32_sycl(const float *src, float *dst, const int lp0,
                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
-                    ne2, ne3);
+                    ne2, ne3, item_ct1);
         });
 }
 
@@ -88,7 +88,7 @@ void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(src0->nb[0] == sizeof(float));
     GGML_ASSERT(src1->nb[0] == sizeof(float));
 
-    GGML_ASSERT(src0->nb[1] == src0->ne[0] * static_cast<int>(sizeof(float)));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
 
     const int src_stride_inner = ncs;
     const int src_stride_seq = ncs * d_inner;
@@ -20,6 +20,18 @@
 typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                   const int & iqs);
 
+static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
+    const uint8_t * x8 = (const uint8_t *) x;
+
+    int x32 = x8[4*i32 + 0] << 0;
+    x32 |= x8[4*i32 + 1] << 8;
+    x32 |= x8[4*i32 + 2] << 16;
+    x32 |= x8[4*i32 + 3] << 24;
+
+    return x32;
+}
+
+
 static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
     const uint16_t* x16 =
         (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
@@ -75,6 +87,28 @@ static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
     val2 = v1 | (v2 << 16);
 }
 
+static __dpct_inline__ sycl::int2 get_int_from_table_16(
+    const int& q4, const int8_t* table) {
+    const uint32_t* table32 = (const uint32_t*)table;
+    uint32_t tmp[2];
+    const uint32_t low_high_selection_indices =
+        (0x32103210 | ((q4 & 0x88888888) >> 1));
+#pragma unroll
+    for (uint32_t i = 0; i < 2; ++i) {
+        const uint32_t shift = 16 * i;
+
+        const uint32_t low =
+            dpct::byte_level_permute(table32[0], table32[1], q4 >> shift);
+        const uint32_t high =
+            dpct::byte_level_permute(table32[2], table32[3], q4 >> shift);
+        tmp[i] = dpct::byte_level_permute(
+            low, high, low_high_selection_indices >> shift);
+    }
+    return sycl::int2(
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x6420),
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x7531));
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 
 // contiguous v/x values
@@ -685,6 +719,30 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }
 
+#define VDR_MXFP4_Q8_1_MMVQ 2
+#define VDR_MXFP4_Q8_1_MMQ  4
+
+static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
+                                                const block_q8_1 * __restrict__ bq8_1,
+                                                const int & iqs) {
+    const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
+        const sycl::int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
+        sumi = ggml_sycl_dp4a(v.x(), q8[l + 0], sumi);
+        sumi = ggml_sycl_dp4a(v.y(), q8[l + 4], sumi);
+    }
+
+    const float d = ggml_sycl_e8m0_to_fp32(bq4->e) * 0.5f * (bq8_1->ds)[0];
+    return d * sumi;
+}
+
 static __dpct_inline__ float
 vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
@@ -659,6 +659,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_cos_f32;
     vk_pipeline pipeline_log[2];
     vk_pipeline pipeline_tri[2];
+    vk_pipeline pipeline_diag[2];
     vk_pipeline pipeline_clamp_f32;
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_roll_f32;
@@ -722,6 +723,11 @@ struct vk_device_struct {
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
     vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
     vk_pipeline pipeline_soft_max_back_f32;
+
+    vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
+    vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
+    vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
+
     vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
     vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
@@ -757,7 +763,8 @@ struct vk_device_struct {
 
     vk_pipeline pipeline_flash_attn_split_k_reduce;
 
-    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
+    // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
+    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
 
     std::vector<vk_pipeline_ref> all_pipelines;
 
@@ -1149,6 +1156,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
 
 struct vk_op_topk_moe_push_constants {
     uint32_t n_rows;
+    uint32_t n_experts_push;
     uint32_t n_expert_used;
     float clamp_min;
     float clamp_max;
@@ -3730,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -3917,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
@@ -3996,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
 
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+
     ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@@ -4204,10 +4223,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
 
+    for (uint32_t use_push = 0; use_push < 2; ++use_push) {
     for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
+    }
     }
 
     for (auto &c : compiles) {
@@ -8274,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     switch (op) {
     case GGML_OP_GET_ROWS:
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        if (src0->type == GGML_TYPE_I32) {
+            // i32 src only supports i32 result
+            GGML_ASSERT(dst->type == GGML_TYPE_I32);
+            return ctx->device->pipeline_get_rows[src0->type];
+        }
         if (dst->type == GGML_TYPE_F16) {
             return ctx->device->pipeline_get_rows[src0->type];
         }
@@ -8400,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
         }
         return nullptr;
+    case GGML_OP_DIAG:
+        if (src0->type == dst->type &&
+            (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
+            return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
+        }
+        return nullptr;
    case GGML_OP_CLAMP:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
@@ -8554,7 +8586,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
            GGML_ASSERT(idx < num_topk_moe_pipelines);
            topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
-           return ctx->device->pipeline_topk_moe[idx][mode];
+           // use n_experts from push constant if it's not equal to the power of two spec constant
+           bool use_push = dst->ne[0] != (1u << idx);
+           return ctx->device->pipeline_topk_moe[idx][mode][use_push];
        }
 
        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -9091,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_COS:
    case GGML_OP_LOG:
    case GGML_OP_TRI:
+   case GGML_OP_DIAG:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_ROLL:
@ -9778,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
||||||
|
|
||||||
|
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
     p.param1 = ggml_get_op_params_f32(dst, 0);

@@ -10111,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
     const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
+    vk_op_soft_max_push_constants pc {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
@@ -10122,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
         n_head_log2,
         nrows_x,
         src2 != nullptr
-    });
+    };
+
+    if (ncols <= 16384) {
+        ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
+    } else {
+        vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
+        vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
+        vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
+        vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
+
+        uint32_t elems_per_wg = 128 * 4;
+        uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
+        size_t tmp_size = num_wgs * nrows_x * sizeof(float);
+
+        if (ctx->prealloc_size_x < tmp_size) {
+            ctx->prealloc_size_x = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_size_y < tmp_size) {
+            ctx->prealloc_size_y = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+
+        vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
+        vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
+
+        std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
+
+        vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
+        vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
+        vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
+
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
+
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+
+        ctx->prealloc_x_need_sync = true;
+        ctx->prealloc_y_need_sync = true;
+    }
 }

 static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
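As a reference for what the new `ncols > 16384` path computes, here is a minimal CPU sketch of the same three-pass split softmax (per-chunk maxima, then exponentials and per-chunk sums, then normalization). The helper name and the single-row handling are illustrative assumptions, not code from this commit:

```cpp
// Minimal sketch, assuming one row of length ncols split into fixed-size chunks.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void soft_max_split_reference(const float * x, float * y, size_t ncols, size_t chunk) {
    const size_t nchunks = (ncols + chunk - 1) / chunk;

    // pass 1: per-chunk maxima (what soft_max_large1 stores into data_m)
    std::vector<float> chunk_max(nchunks, -INFINITY);
    for (size_t c = 0; c < nchunks; ++c) {
        for (size_t i = c * chunk; i < std::min(ncols, (c + 1) * chunk); ++i) {
            chunk_max[c] = std::max(chunk_max[c], x[i]);
        }
    }

    // pass 2: combine maxima, write exp(x - max) and per-chunk sums (soft_max_large2 / data_s)
    float max_val = -INFINITY;
    for (float m : chunk_max) max_val = std::max(max_val, m);
    std::vector<float> chunk_sum(nchunks, 0.0f);
    for (size_t c = 0; c < nchunks; ++c) {
        for (size_t i = c * chunk; i < std::min(ncols, (c + 1) * chunk); ++i) {
            y[i] = std::exp(x[i] - max_val);
            chunk_sum[c] += y[i];
        }
    }

    // pass 3: combine the partial sums and normalize (soft_max_large3)
    float sum = 0.0f;
    for (float s : chunk_sum) sum += s;
    for (size_t i = 0; i < ncols; ++i) {
        y[i] /= sum;
    }
}
```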
@@ -10158,6 +10247,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,

     vk_op_topk_moe_push_constants pc {};
     pc.n_rows = n_rows;
+    pc.n_experts_push = n_experts;
     pc.n_expert_used = n_expert_used;
     if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
         ggml_tensor * clamp = cgraph->nodes[node_idx + 7];

@@ -11857,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TRI:
         ggml_vk_tri(ctx, compute_ctx, src0, node);

+        break;
+    case GGML_OP_DIAG:
+        ggml_vk_diag(ctx, compute_ctx, src0, node);
+
         break;
     case GGML_OP_CLAMP:
         ggml_vk_clamp(ctx, compute_ctx, src0, node);

@@ -12832,8 +12926,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
     }

     const int n_expert = softmax->ne[0];
-    // n_expert must be a power of 2
-    if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
+    if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
         return false;
     }

@@ -13877,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
+        case GGML_TYPE_I32:
             return true;
         default:
             return false;

@@ -14001,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_LOG:
         case GGML_OP_TRI:
+        case GGML_OP_DIAG:
             return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                    op->type == op->src[0]->type;
         case GGML_OP_ARGSORT:

@@ -14591,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_TRI) {
         tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
+    } else if (tensor->op == GGML_OP_DIAG) {
+        tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_CLAMP) {
         const float * params = (const float *)tensor->op_params;
         tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
@@ -0,0 +1,29 @@
+#version 450
+
+#include "rte.glsl"
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+
+    if (i10 == i11) {
+        const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]);
+        data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
+    } else {
+        data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
+    }
+}
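The new shader implements GGML_OP_DIAG: an output element is copied from row 0 of the source when its row and column indices coincide, and zeroed otherwise. A minimal CPU sketch of that behaviour for a single 2D slice (hypothetical helper, not from this commit):

```cpp
// out[i11][i10] = (i10 == i11) ? in_row0[i10] : 0 for one n x n slice.
#include <cstddef>

void diag_reference(const float * in_row0, float * out, size_t n) {
    for (size_t i11 = 0; i11 < n; ++i11) {
        for (size_t i10 = 0; i10 < n; ++i10) {
            out[i11 * n + i10] = (i10 == i11) ? in_row0[i10] : 0.0f;
        }
    }
}
```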
@@ -256,6 +256,9 @@ void main() {
             barrier();
         }

+        // prevent race on tmpsh
+        barrier();
+
         // reduce across threads

         [[unroll]] for (uint32_t r = 0; r < Br; ++r) {

@@ -302,6 +302,9 @@ void main() {
             barrier();
         }

+        // prevent race on tmpsh
+        barrier();
+
         // reduce across threads

         float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];
@@ -26,9 +26,9 @@ void main() {
     const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;

 #if defined(DATA_A_BF16)
-    FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
+    TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
 #else
-    FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]);
+    TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]);
 #endif
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
     data_d[d_offset + i00] = D_TYPE(v);
@@ -7,36 +7,52 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

 FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];

-void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
-    const uint y_idx = i * QUANT_K + 32 * ib32;
-
-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
-    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const float d = float(data_a[ibi].d);
-        const uint qh = data_a[ibi].qh[ib32];
-        const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
-        const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
-
-        [[unroll]] for (uint l = 0; l < 4; ++l) {
-            const uint qs = data_a[ibi].qs[4 * ib32 + l];
-            const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
-            const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
-
-            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-                vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
-                vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
-
-                FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-                [[unroll]] for (int k = 0; k < 4; ++k) {
-                    sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
-                          fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
-                }
-                temp[j][n] = fma(dl, sum, temp[j][n]);
-            }
-        }
-        ibi += num_blocks_per_row;
-    }
-}
+void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
+                     const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx_base = i * QUANT_K + 32 * ib32;
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4;
+        [[unroll]] for (uint l = 0; l < 4; ++l) {
+            const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]);
+            const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);
+
+            // index for data_a
+            uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                const float d = float(data_a[ibi].d);
+                const uint qh = data_a[ibi].qh[ib32];
+                const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
+                const uint qs = data_a[ibi].qs[4 * ib32 + l];
+                const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
+                const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]);
+
+                const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
+                const vec4 delta_v = vec4(delta_val);
+                const vec4 fbits0 = vec4(
+                    float(bitfieldExtract(grid, 0, 2)),
+                    float(bitfieldExtract(grid, 2, 2)),
+                    float(bitfieldExtract(grid, 4, 2)),
+                    float(bitfieldExtract(grid, 6, 2))
+                );
+                const vec4 fbits1 = vec4(
+                    float(bitfieldExtract(grid, 8, 2)),
+                    float(bitfieldExtract(grid, 10, 2)),
+                    float(bitfieldExtract(grid, 12, 2)),
+                    float(bitfieldExtract(grid, 14, 2))
+                );
+
+                vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0));
+                sum_v = fma(b_val_1, fbits1 + delta_v, sum_v);
+                FLOAT_TYPE sum = dot(sum_v, vec4(1.0));
+
+                temp[j][n] = fma(dl, sum, temp[j][n]);
+                ibi += num_blocks_per_row;
+            }
+        }
+    }
+}

 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
@@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
     const uint iqs = idx % 128; // 0..127

     const uint n = iqs / 64;                     // 0,1
-    const uint b = (iqs % 64) / 32;              // 0,1
+    const uint b = ((iqs % 64) / 32) * 4;        // 0,4
     const uint is_b = (iqs % 16) / 8;            // 0,1
     const uint qhshift = ((iqs % 64) / 16) * 2;  // 0,2,4,6
     const uint is = 8 * n + qhshift + is_b;      // 0..15
-    const uint qsi = n * 64 + (iqs % 32) * 2;    // 0,2,4..126
-    const uint qhi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+    const uint qsi = n * 32 + (iqs % 32);        // 0..63
+    const uint qhi = n * 16 + (iqs % 16);        // 0..31

     const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);

-    buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32),
-                                     dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+    const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F;
+    const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
+    const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;
+
+    buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
 #elif defined(DATA_A_IQ1_S)
     const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
     const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
@@ -0,0 +1,62 @@
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+void main() {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y;
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
+
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02);
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    uint rowy_start = 0;
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    float slope = get_slope(rowx);
+
+    // Find max
+    FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
+
+    [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        FLOAT_TYPE a = FLOAT_TYPE(0);
+        if (col < p.KX) {
+            a = data_a[rowx * p.KX + col];
+        }
+
+        FLOAT_TYPE b = FLOAT_TYPE(0);
+        if (p.KY > 0 && col < p.KX) {
+            b = data_b[rowy_start + col];
+        }
+
+        FLOAT_TYPE v = a * p.scale + slope * b;
+
+        if (col < p.KX) {
+            max_val = max(max_val, v);
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(vals[tid], vals[tid + s]);
+        }
+        barrier();
+    }
+
+    if (tid == 0) {
+        max_val = vals[0];
+        data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val;
+    }
+}
@@ -0,0 +1,79 @@
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+void main() {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y;
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
+
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02);
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    uint rowy_start = 0;
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    float slope = get_slope(rowx);
+
+    // Find max
+    FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
+
+    [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
+        if (i + tid < gl_NumWorkGroups.x) {
+            max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(max_val, vals[tid + s]);
+        }
+        barrier();
+    }
+
+    max_val = vals[0];
+    barrier();
+
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
+
+    // Compute sum{exp(x - max)}
+    [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            break;
+        }
+
+        // compute exp(a*scale+b*slope), add it to sum
+        const uint i = rowx * p.KX + col;
+        FLOAT_TYPE val;
+        val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val);
+        sum += val;
+        data_d[i] = D_TYPE(val);
+    }
+
+    // reduce across the workgroup
+    vals[tid] = sum;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] += vals[tid + s];
+        }
+        barrier();
+    }
+
+    if (tid == 0) {
+        sum = vals[0];
+        data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum;
+    }
+}
@@ -0,0 +1,65 @@
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+shared FLOAT_TYPE sumsh[BLOCK_SIZE];
+
+void main() {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y;
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
+
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02);
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    uint rowy_start = 0;
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
+
+    [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
+        if (i + tid < gl_NumWorkGroups.x) {
+            max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
+            sum += data_s[rowx * gl_NumWorkGroups.x + i + tid];
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
+    sumsh[tid] = sum;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(max_val, vals[tid + s]);
+            sumsh[tid] += sumsh[tid + s];
+        }
+        barrier();
+    }
+
+    max_val = vals[0];
+    sum = sumsh[0];
+
+    if (p.has_sinks != 0) {
+        sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val));
+    }
+
+    FLOAT_TYPE rcpdivisor = 1.0/sum;
+
+    [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            continue;
+        }
+
+        data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
+    }
+}
@@ -0,0 +1,53 @@
+#extension GL_EXT_control_flow_attributes : enable
+
+layout (push_constant) uniform parameter
+{
+    uint KX;
+    uint KY;
+    uint ne00;
+    uint ne01;
+    uint ne02;
+    uint ne12;
+    uint ne13;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
+    uint nrows_x;
+    uint has_sinks;
+} p;
+
+#include "types.glsl"
+
+layout(constant_id = 0) const uint BLOCK_SIZE = 128;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+layout(constant_id = 1) const uint num_iters = 4;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
+layout (binding = 2) readonly buffer Z {float data_c[];};
+layout (binding = 3) buffer D {D_TYPE data_d[];};
+layout (binding = 4) buffer M {float data_m[];};
+layout (binding = 5) buffer S {float data_s[];};
+
+shared FLOAT_TYPE vals[BLOCK_SIZE];
+
+float get_slope(uint rowx) {
+    float slope = 1.0f;
+
+    // ALiBi
+    if (p.max_bias > 0.0f) {
+        const uint h = (rowx / p.ne01) % p.ne02; // head index
+
+        const float base = h < p.n_head_log2 ? p.m0 : p.m1;
+        const uint  exp  = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    return slope;
+}
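For reference, `get_slope` restates the usual ALiBi slope schedule, with `m0` and `m1` computed on the host (see the `powf` lines in the soft-max hunk above) and passed through the push constants. With `h` the head index:

$$
m_0 = 2^{-\frac{\text{max\_bias}}{n_{\text{head\_log2}}}}, \qquad
m_1 = 2^{-\frac{\text{max\_bias}/2}{n_{\text{head\_log2}}}}, \qquad
\text{slope}(h) =
\begin{cases}
m_0^{\,h+1} & h < n_{\text{head\_log2}} \\
m_1^{\,2(h - n_{\text{head\_log2}}) + 1} & \text{otherwise}
\end{cases}
$$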
@@ -10,6 +10,7 @@
 layout (push_constant) uniform parameter
 {
     uint n_rows;
+    uint n_experts_push;
     uint n_expert_used;
     float clamp_min;
     float clamp_max;

@@ -18,11 +19,16 @@ layout (push_constant) uniform parameter
 layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;

 layout(constant_id = 0) const uint WARP_SIZE = 32;
-layout(constant_id = 1) const uint n_experts = 512;
+layout(constant_id = 1) const uint n_experts_spec = 512;
 layout(constant_id = 2) const bool with_norm = true;
 layout(constant_id = 3) const bool late_softmax = false;
+layout(constant_id = 4) const bool nexperts_use_push = false;

-const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
+uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);

 layout (binding = 0, std430) readonly buffer Logits {float logits[];};
 layout (binding = 1, std430) writeonly buffer Weights {float weights[];};

@@ -94,7 +100,7 @@ void main() {
     }

     if (!late_softmax) {
-        softmax_warp_inplace(wt, n_experts, lane, false);
+        softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
     }

     // at this point, each thread holds a portion of softmax,
@@ -704,13 +704,15 @@ void process_shaders() {
         shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp";

         if (tname == "f16") {
-            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
+            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
         } else {
-            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
+            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
         }
-        string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
+        string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
     }

+    string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});
+
     string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
     string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
     string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});

@@ -854,6 +856,8 @@ void process_shaders() {
     string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

     string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

@@ -899,6 +903,13 @@ void process_shaders() {
     string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
     string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));

+    string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
     string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
|
||||||
|
*log_callback = g_logger_state.log_callback;
|
||||||
|
*user_data = g_logger_state.log_callback_user_data;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
|
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||||
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
|
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
|
||||||
g_logger_state.log_callback_user_data = user_data;
|
g_logger_state.log_callback_user_data = user_data;
|
||||||
|
|
|
||||||
|
|
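With both a getter and a setter available, callers can save and restore the global logger around code that temporarily replaces it. A minimal sketch of that pattern (the `silent_logger` callback and wrapper are illustrative, not part of this commit):

```cpp
// Save the current ggml logger, swap in a temporary callback, then restore it.
#include "ggml.h"

static void silent_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) text; (void) user_data; // drop all log output
}

void with_silent_logging(void (*fn)(void)) {
    ggml_log_callback prev_cb = nullptr;
    void * prev_ud = nullptr;
    ggml_log_get(&prev_cb, &prev_ud); // save the global logger state

    ggml_log_set(silent_logger, nullptr);
    fn();                             // run work without log output

    ggml_log_set(prev_cb, prev_ud);   // restore the previous logger
}
```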
@@ -3320,6 +3320,7 @@ class VisionProjectorType:
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
+    GLMA = "glma" # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"
     LFM2 = "lfm2"
@@ -1,6 +1,6 @@
 # GBNF Guide

-GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `tools/main` and `tools/server`.
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `tools/cli`, `tools/completion` and `tools/server`.

 ## Background

@@ -135,7 +135,7 @@ While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) ma
 You can use GBNF grammars:

 - In [llama-server](../tools/server)'s completion endpoints, passed as the `grammar` body field
-- In [llama-cli](../tools/main), passed as the `--grammar` & `--grammar-file` flags
+- In [llama-cli](../tools/cli) and [llama-completion](../tools/completion), passed as the `--grammar` & `--grammar-file` flags
 - With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.

 ## JSON Schemas → GBNF

@@ -145,7 +145,7 @@ You can use GBNF grammars:
 - In [llama-server](../tools/server):
   - For any completion endpoints, passed as the `json_schema` body field
   - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}` or `{ type: "json_schema", json_schema: {"schema": ...} }`)
-- In [llama-cli](../tools/main), passed as the `--json` / `-j` flag
+- In [llama-cli](../tools/cli) and [llama-completion](../tools/completion), passed as the `--json` / `-j` flag
 - To convert to a grammar ahead of time:
   - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
   - in JavaScript with [json-schema-to-grammar.mjs](../tools/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../tools/server)'s Web UI)
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -466,10 +467,24 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);

+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char * path_model,
+            struct llama_model_params * mparams,
+            struct llama_context_params * cparams,
+            float * tensor_split,                                            // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t margin,                                                   // margin of memory to leave per device in bytes
+            uint32_t n_ctx_min,                                              // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level log_level);                                  // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);

     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);

     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);

@@ -1354,6 +1369,8 @@ extern "C" {

     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
     LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

     //
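A minimal sketch of how `llama_params_fit` could be called before loading a model, with buffers sized according to the comments in the declaration above. The 512 MiB margin and `n_ctx_min = 4096` are arbitrary example choices, not defaults from this commit:

```cpp
// Sketch: shrink model/context parameters until they fit free device memory.
#include <vector>
#include "llama.h"

bool try_fit(const char * path_model, llama_model_params & mparams, llama_context_params & cparams) {
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const bool ok = llama_params_fit(path_model, &mparams, &cparams,
                                     tensor_split.data(), overrides.data(),
                                     /*margin   =*/ 512ull * 1024 * 1024,
                                     /*n_ctx_min=*/ 4096,
                                     GGML_LOG_LEVEL_INFO);
    // The function may rewrite mparams/cparams (and fill the two buffers) so that the
    // model fits; keep the buffers alive for as long as the adjusted params are used.
    return ok;
}
```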
@@ -1,5 +1,5 @@
 {
-  "extraPaths": ["gguf-py"],
+  "extraPaths": ["gguf-py", "examples/model-conversion/scripts"],
   "pythonVersion": "3.9",
   "pythonPlatform": "All",
   "reportUnusedImport": "warning",
@@ -0,0 +1,281 @@
+import argparse
+import requests
+import json
+from pathlib import Path
+import logging
+
+logger = logging.getLogger("compare-logprobs")
+logging.basicConfig(level=logging.INFO)
+
+
+DESCRIPTION = """
+Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints.
+
+Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally.
+
+Example usage:
+  Step 1: Dump logits from two different servers
+    python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions
+    python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions
+
+    (optionally, you can add --api-key <key> if the endpoint requires authentication)
+
+  Step 2: Compare the dumped logits
+    python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md
+"""
+
+
+def generate_input_prompt(length: int) -> list[str]:
+    CORPUS = """
+You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
+
+### Tool Call Format:
+When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
+
+You can make multiple calls in one go by placing them one after another.
+"""
+    words = [w.strip() for w in CORPUS.strip().split(" ")]
+    words = [w for w in words if len(w) > 0]  # filter out empty strings
+    while len(words) < length:
+        words += words
+    return words[:length]
+
+
+def dump_logits(
+    endpoint: str,
+    output_path: Path,
+    input_words: list[str],
+    pattern: list[tuple[bool, int]],
+    api_key=None,
+):
+    logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...")
+    words = input_words
+    curr_text = ""
+    n_total = sum(n for get, n in pattern if get)
+    n_done = 0
+    i_cur = 0
+    i_total = len(words)
+    with output_path.open("w") as f:
+        for get, n in pattern:
+            if not get:
+                # skip n words
+                for i in range(n):
+                    curr_text += words.pop(0) + " "
+                    i_cur += 1
+                continue
+            # get n words
+            for i in range(n):
+                curr_text += words.pop(0) + " "
+                payload = {
+                    "prompt": curr_text.strip(),
+                    "temperature": 0.0,
+                    "top_k": 1,
+                    "max_tokens": 1,
+                    "logprobs": 1,
+                    "stream": False,
+                }
+                response = requests.post(
+                    endpoint,
+                    json=payload,
+                    headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
+                )
+                response.raise_for_status()
+                data = response.json()
+                data["__index"] = i_cur  # add index for easier debugging later
+                data = json.dumps(data)
+                f.write(f"{data}\n")
+                n_done += 1
+                i_cur += 1
+                logger.info(
+                    f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]"
+                )
+    logger.info(f"Logits dumped to {output_path}")
+
+
+def get_token_logprobs(data: dict):
+    logprobs = data["choices"][0]["logprobs"]
+    if "content" in logprobs:
+        # llama.cpp case
+        top = logprobs["content"][0]["top_logprobs"][0]
+        return top["token"], top["logprob"]
+    else:
+        # vllm case
+        tokens = logprobs["tokens"]
+        token_logprobs = logprobs["token_logprobs"]
+        return tokens[0], token_logprobs[0]
+
+
+def clean_text(text: str) -> str:
+    return (
+        "'"
+        + text.replace("\n", "\\n")
+        .replace("\t", "\\t")
+        .replace("\r", "\\r")
+        .replace("|", "\\|")
+        + "'"
+    )
+
+
+def compare_logits(input1: Path, input2: Path, output_path: Path):
+    with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout:
+        lines1 = f1.readlines()
+        lines2 = f2.readlines()
+
+        tab_header = [
+            "idx",
+            input1.name,
+            "logprob_1",
+            input2.name,
+            "logprob_2",
+            "diff (abs)",
+        ]
+        tab_entries = []
+        tab_max_widths = [len(h) for h in tab_header]
+
+        assert len(lines1) == len(
+            lines2
+        ), "Input files must have the same number of lines."
+
+        fout.write("# Logits Comparison Report\n\n")
+        for i, (line1, line2) in enumerate(zip(lines1, lines2)):
+            if not line1.strip() or not line2.strip():
+                continue  # skip empty lines
+
+            data1 = json.loads(line1)
+            data2 = json.loads(line2)
+
+            idx1 = data1.get("__index", -1)
+            idx2 = data2.get("__index", -1)
+            if idx1 != idx2:
+                logger.warning(
+                    f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}"
+                )
+
+            token1, logprob1 = get_token_logprobs(data1)
+            token2, logprob2 = get_token_logprobs(data2)
+
+            token1 = clean_text(token1)
+            token2 = clean_text(token2)
+            abs_diff = abs(logprob1 - logprob2)
+
+            tab_entries.append(
+                (
+                    str(idx1 + 1),
+                    token1,
+                    f"{logprob1:.4f}",
+                    token2,
+                    f"{logprob2:.4f}",
+                    f"{(abs_diff):.4f}",
+                )
+            )
+
+        for i in range(len(tab_entries)):
+            for j in range(len(tab_header)):
+                tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j]))
+
+        output = ""
+        for j in range(len(tab_header)):
+            output += f"| {tab_header[j]:<{tab_max_widths[j]}} "
+        output += "|\n"
+        for j in range(len(tab_header)):
+            output += f"|{'-' * (tab_max_widths[j] + 2)}"
+        output += "|\n"
+        for entry in tab_entries:
+            for j in range(len(tab_header)):
+                output += f"| {entry[j]:<{tab_max_widths[j]}} "
+            output += "|\n"
+
+        logger.info("\n" + output)
+        fout.write(output)
+        logger.info(f"Report written to {output_path}")
+
+
+def parse_pattern(pattern: str) -> list[tuple[bool, int]]:
+    parts = pattern.split(",")
+    result = []
+    for i, part in enumerate(parts):
+        n = int(part)
+        if i % 2 == 0:
+            result.append((True, n))  # get n words
+        else:
+            result.append((False, n))  # skip n words
+    return result
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter
+    )
+    subparsers = parser.add_subparsers(
+        dest="verb", required=True, help="action to perform"
+    )
+
+    # dump subcommand
+    parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint")
+    parser_dump.add_argument(
+        "output", type=Path, help="output path for dumped logits (.log)"
+    )
+    parser_dump.add_argument(
+        "endpoint", type=str, help="OAI-compat /completions endpoint"
+    )
+    parser_dump.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help="API key for authentication (if required)",
+    )
+    parser_dump.add_argument(
+        "--file",
+        type=Path,
+        default=None,
+        help="File containing prompt to use instead of the default",
+    )
+    parser_dump.add_argument(
+        "--pattern",
+        type=str,
+        default="10,1000,10,4000,10",
+        help="Pattern n_get,n_skip,... where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)",
+    )
+
+    # compare subcommand
+    parser_compare = subparsers.add_parser(
+        "compare", help="compare two dumped logits files"
+    )
+    parser_compare.add_argument("input1", type=Path, help="first input file (.log)")
+    parser_compare.add_argument("input2", type=Path, help="second input file (.log)")
+    parser_compare.add_argument(
+        "output", type=Path, help="output path for comparison report (.md)"
+    )
+
+    try:
+        return parser.parse_args()
+    except Exception as e:
+        parser.print_help()
+        raise e
+
+
+def main():
+    args = parse_args()
+
+    if args.verb == "dump":
+        pattern = parse_pattern(args.pattern)
+        input_length = sum(n for _, n in pattern)
+        input_words = generate_input_prompt(input_length)
+        if args.file is not None:
+            with args.file.open("r") as f:
+                input_words = f.read().strip().split(" ")
+            if input_length < sum(n for _, n in pattern):
+                raise ValueError(
+                    f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
+                )
+            input_length = len(input_words)
+        logger.info(f"Using {input_length} words")
+        dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
+    elif args.verb == "compare":
+        compare_logits(args.input1, args.input2, args.output)
+    else:
+        raise ValueError(f"Unknown verb: {args.verb}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1 +1 @@
-55bc9320a4aae82af18e23eefd5de319a755d7b9
+130bc125a88bb57664b88932c48c38a1cb316fac
@@ -9,6 +9,7 @@
 #include "llama-model.h"

 #include <cinttypes>
+#include <cmath>
 #include <cstring>
 #include <limits>
 #include <stdexcept>

@@ -72,6 +73,43 @@ llama_context::llama_context(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+            float mscale = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }
+
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;

     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
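Restated as a formula (with `s = 1/rope_freq_scale` and `m(s, μ)` the mscale helper defined by the lambda above), the new code sets:

$$
m(s, \mu) = \begin{cases} 1 & s \le 1 \\ 0.1\,\mu \ln s + 1 & s > 1 \end{cases},
\qquad
\text{yarn\_attn\_factor} = \frac{m(s, \text{mscale})}{m(s, \text{mscale\_all\_dim})} \cdot \frac{1}{1 + 0.1 \ln s}
$$

The trailing factor cancels the implicit $0.1\ln s + 1$ correction that the RoPE kernels apply when `yarn_ext_factor != 0`, as noted in the comments above.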
@ -220,6 +258,7 @@ llama_context::llama_context(
|
||||||
|
|
||||||
backend_buft.clear();
|
backend_buft.clear();
|
||||||
backend_ptrs.clear();
|
backend_ptrs.clear();
|
||||||
|
backend_buf_exp_size.clear();
|
||||||
|
|
||||||
for (auto & backend : backends) {
|
for (auto & backend : backends) {
|
||||||
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
||||||
|
|
@ -236,6 +275,7 @@ llama_context::llama_context(
|
||||||
|
|
||||||
backend_buft.push_back(buft);
|
backend_buft.push_back(buft);
|
||||||
backend_ptrs.push_back(backend.get());
|
backend_ptrs.push_back(backend.get());
|
||||||
|
backend_buf_exp_size.push_back(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||||
|
|
@ -351,7 +391,8 @@ llama_context::llama_context(
|
||||||
|
|
||||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||||
{
|
{
|
||||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
||||||
|
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
||||||
if (!gf) {
|
if (!gf) {
|
||||||
if (pipeline_parallel) {
|
if (pipeline_parallel) {
|
||||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||||
|
|
@ -369,7 +410,7 @@ llama_context::llama_context(
|
||||||
|
|
||||||
// reserve with tg (token generation) graph to get the number of splits and nodes
|
// reserve with tg (token generation) graph to get the number of splits and nodes
|
||||||
{
|
{
|
||||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
|
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
|
||||||
if (!gf) {
|
if (!gf) {
|
||||||
throw std::runtime_error("failed to allocate compute tg buffers");
|
throw std::runtime_error("failed to allocate compute tg buffers");
|
||||||
}
|
}
|
||||||
|
|
@ -384,7 +425,7 @@ llama_context::llama_context(
|
||||||
//
|
//
|
||||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||||
//
|
//
|
||||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||||
if (!gf) {
|
if (!gf) {
|
||||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||||
}
|
}
|
||||||
|
|
@ -393,11 +434,13 @@ llama_context::llama_context(
|
||||||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||||
ggml_backend_t backend = backend_ptrs[i];
|
ggml_backend_t backend = backend_ptrs[i];
|
||||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||||
size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
if (!model.hparams.no_alloc) {
|
||||||
if (size > 1) {
|
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||||
|
}
|
||||||
|
if (backend_buf_exp_size[i] > 1) {
|
||||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||||
ggml_backend_buft_name(buft),
|
ggml_backend_buft_name(buft),
|
||||||
size / 1024.0 / 1024.0);
|
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -416,6 +459,23 @@ llama_context::llama_context(
 }

 llama_context::~llama_context() {
+    // FIXME this currently results in a use-after-free bug if the model is freed before the context
+    // if (!model.hparams.no_alloc) {
+    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+    //         ggml_backend_t backend = backend_ptrs[i];
+    //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+    //         const size_t size_exp = backend_buf_exp_size[i];
+    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    //         if (size_exp == size_act) {
+    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         } else {
+    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         }
+    //     }
+    // }
     ggml_opt_free(opt_ctx);
 }
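The commented-out destructor block above would compare the compute-buffer size recorded at reservation time against what the scheduler actually allocated. A minimal, self-contained sketch of that check follows; the function name, buffer-type labels, and sizes are illustrative, not real llama.cpp members:

#include <cstdio>

// Hedged sketch of the verification the disabled destructor code would perform.
static void check_compute_buffer(const char * buft_name, size_t size_exp, size_t size_act) {
    if (size_exp == size_act) {
        printf("%10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
            buft_name, size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
    } else {
        printf("%10s compute buffer size of %8.4f MiB does not match expectation of %8.4f MiB\n",
            buft_name, size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
    }
}

int main() {
    check_compute_buffer("CUDA0", 512ull << 20, 512ull << 20); // hypothetical values
    check_compute_buffer("CPU",    64ull << 20,  72ull << 20);
    return 0;
}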
@@ -1318,6 +1378,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
         LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+        synchronize();
         buf_output = nullptr;
         logits     = nullptr;
         embd       = nullptr;
@@ -1389,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
     return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }

-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
+ggml_cgraph * llama_context::graph_reserve(
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);

@@ -1426,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     // initialize scheduler with the specified graph
     if (split_only) {
+        if (sizes) {
+            ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+        } else {
             ggml_backend_sched_split_graph(sched.get(), gf);
+        }
     } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        GGML_ASSERT(!sizes);
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
@@ -2049,15 +2116,26 @@ void llama_context::perf_reset() {

 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-    for (const auto & buft_size : model.memory_breakdown()) {
-        ret[buft_size.first].model += buft_size.second;
+    for (const auto & [buft, size] : model.memory_breakdown()) {
+        ret[buft].model += size;
     }
-    for (const auto & buft_size : memory->memory_breakdown()) {
-        ret[buft_size.first].context += buft_size.second;
+    if (memory) {
+        for (const auto & [buft, size] : memory->memory_breakdown()) {
+            ret[buft].context += size;
+        }
     }
+    if (model.hparams.no_alloc) {
+        for (size_t i = 0; i < backends.size(); ++i) {
+            ggml_backend_t backend = backends[i].get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += backend_buf_exp_size[i];
+        }
+    } else {
+        for (const auto & backend_ptr : backends) {
+            ggml_backend_t backend = backend_ptr.get();
-            ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
     }
     return ret;
 }
@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
     size_t model   = 0; // memory allocated for the model
     size_t context = 0; // memory allocated for the context
     size_t compute = 0; // memory allocated for temporary compute buffers
+
+    size_t total() const {
+        return model + context + compute;
+    }
 };

 struct llama_context {
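For reference, a small self-contained sketch of how the new total() helper can be used to report an overall memory figure; the struct below mirrors llama_memory_breakdown_data from the hunk above and the sizes are made up:

#include <cstddef>
#include <cstdio>

// Mirrors llama_memory_breakdown_data from the hunk above; values are illustrative.
struct memory_breakdown_data {
    size_t model   = 0;
    size_t context = 0;
    size_t compute = 0;

    size_t total() const { return model + context + compute; }
};

int main() {
    memory_breakdown_data mb;
    mb.model   = 4096ull << 20; // hypothetical model weights
    mb.context =  512ull << 20; // hypothetical KV cache
    mb.compute =   96ull << 20; // hypothetical compute buffers
    printf("total = %.2f MiB\n", mb.total() / 1024.0 / 1024.0);
    return 0;
}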
@@ -206,7 +210,8 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);

     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
+    ggml_cgraph * graph_reserve(
+            uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

 private:
     llm_graph_params graph_params(

@@ -281,9 +286,10 @@ private:
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

-    // buffer types used for the compute buffer of each backend
+    // pointers and buffer types used for the compute buffer of each backend
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<size_t> backend_buf_exp_size; // expected buffer sizes

     llm_graph_result_ptr gf_res_prev;
     llm_graph_result_ptr gf_res_reserve;
@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
-                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
             ) * f_attn_temp_scale + 1.0;
         }

@@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     freq_base   (cparams.rope_freq_base),
     freq_scale  (cparams.rope_freq_scale),
     ext_factor  (cparams.yarn_ext_factor),
-    attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
+    attn_factor (cparams.yarn_attn_factor),
     beta_fast   (cparams.yarn_beta_fast),
     beta_slow   (cparams.yarn_beta_slow),
     norm_eps    (hparams.f_norm_eps),

@@ -1203,7 +1203,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

     auto & cur = inp->attn_scale;

@@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

@@ -142,6 +142,7 @@ public:

     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
+    const float    f_attn_temp_offset;
 };

 class llm_graph_input_pos_bucket : public llm_graph_input_i {
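The llama4-style temperature tuning above now takes the position offset from the hyperparameters instead of the previously hard-coded +1.0f. A small self-contained sketch of the scaling formula; the parameter values are made up for illustration, not taken from any real model:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
    // hypothetical hyperparameters, for illustration only
    const uint32_t n_attn_temp_floor_scale = 8192;
    const float    f_attn_temp_scale       = 0.1f;
    const float    f_attn_temp_offset      = 1.0f; // previously hard-coded as +1.0f

    for (float pos : {0.0f, 8191.0f, 32768.0f}) {
        const float attn_scale = std::log(
            std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
        ) * f_attn_temp_scale + 1.0;
        printf("pos = %7.0f -> attn scale = %.4f\n", pos, attn_scale);
    }
    return 0;
}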
@@ -3,7 +3,6 @@

 #include "ggml.h"

 #include <cassert>
-#include <cmath>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {

@@ -231,13 +230,3 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

     return false;
 }
-
-float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
-    GGML_ASSERT(ext_factor >= 0.0f);
-
-    if (ext_factor != 0.0f) {
-        attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
-    }
-
-    return attn_factor;
-}

@@ -34,6 +34,7 @@ struct llama_hparams_convnext {

 struct llama_hparams {
     bool vocab_only;
+    bool no_alloc;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;

@@ -165,6 +166,7 @@ struct llama_hparams {
     uint32_t n_no_rope_layer_step    = 4;
     uint32_t n_attn_temp_floor_scale = 0;
     float    f_attn_temp_scale       = 0.0f;
+    float    f_attn_temp_offset      = 0.0f; // offset position index

     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs

@@ -268,13 +270,6 @@ struct llama_hparams {
     // TODO: think of a better place for this function
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
-
-    // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
-    // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
-    //
-    // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
-    //      https://github.com/ggml-org/llama.cpp/pull/17945
-    static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
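For context, the removed helper rescaled the YARN attention factor when yarn_ext_factor is non-zero, apparently to cancel a correction applied inside ggml's RoPE (per the references in the deleted comment). A standalone sketch of what the deleted function computed, with made-up inputs:

#include <cassert>
#include <cmath>
#include <cstdio>

// Sketch of the yarn_attn_factor_adjust() helper deleted in this change,
// reproduced here only as a reference for what it used to compute.
static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
    assert(ext_factor >= 0.0f);
    if (ext_factor != 0.0f) {
        attn_factor *= 1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale));
    }
    return attn_factor;
}

int main() {
    // hypothetical values: 4x context extension (freq_scale = 0.25) with YARN enabled
    printf("adjusted attn_factor = %f\n", yarn_attn_factor_adjust(1.0f, 0.25f, 1.0f));
    return 0;
}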
@@ -25,6 +25,10 @@ time_meas::~time_meas() {
     }
 }

+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    ggml_log_get(log_callback, user_data);
+}
+
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
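A possible use of the new llama_log_get() alongside the existing llama_log_set(): temporarily swap in a custom logger and restore the previous one afterwards. This sketch assumes the llama_log_get() added by this change is exposed in llama.h; the callback name and filtering level are illustrative:

#include <cstdio>
#include "llama.h"

// Custom logger that only forwards warnings and errors to stderr.
static void my_logger(ggml_log_level level, const char * text, void * /*user_data*/) {
    if (level >= GGML_LOG_LEVEL_WARN) {
        fputs(text, stderr);
    }
}

int main() {
    ggml_log_callback prev_cb   = nullptr;
    void *            prev_data = nullptr;

    llama_log_get(&prev_cb, &prev_data); // save the current logger
    llama_log_set(my_logger, nullptr);   // install a quieter one

    // ... do noisy work here ...

    llama_log_set(prev_cb, prev_data);   // restore the previous logger
    return 0;
}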
@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+        }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }
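Pulled out of the hunk above for readability: with no_alloc the KV-cache tensors receive a zero-size placeholder buffer, so the scheduler treats them as already placed and never tries to allocate them; otherwise a real buffer backs the whole context. This helper is a sketch based on the diff, not a function that exists in the tree:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: mirrors the no_alloc branch introduced above (hypothetical helper, not part of llama.cpp).
static ggml_backend_buffer_t alloc_kv_buffer(ggml_context * ctx, ggml_backend_buffer_type_t buft, bool no_alloc) {
    if (no_alloc) {
        // zero-size "dummy" buffer: the tensors are never backed by real memory
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0);
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
            t->buffer = buf; // marks the tensor as placed so the scheduler skips it
        }
        return buf;
    }
    // normal path: allocate one buffer large enough for every tensor in the context
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}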
@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [ctx, buf] : ctxs_bufs) {
+        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+        if (hparams.no_alloc) {
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+            ret[buft] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
     return ret;
 }

@@ -1372,7 +1389,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
     const auto & yarn_beta_fast  = cparams.yarn_beta_fast;
     const auto & yarn_beta_slow  = cparams.yarn_beta_slow;
-    const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

     const auto & n_rot     = hparams.n_rot;
     const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;

@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }

 std::string llama_model_loader::get_arch_name() const {

@@ -71,6 +71,7 @@ struct llama_model_loader {

     bool use_mmap = false;
     bool check_tensors;
+    bool no_alloc;

     llama_files files;
     llama_ftype ftype;

@@ -97,6 +98,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
Some files were not shown because too many files have changed in this diff.