Merge tag 'b7438' into dev-hexagon-refactoring-part2

This commit is contained in:
chraac 2025-12-17 10:37:09 +08:00
commit 0656a5ece4
226 changed files with 12583 additions and 7544 deletions

View File

@ -4,7 +4,7 @@
# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

# ==============================================================================
# BUILD STAGE
@ -107,11 +107,11 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]

### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

ENTRYPOINT [ "/app/llama-cli" ]

View File

@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
FROM base AS light
COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \ RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli cmake --build build --config Release --target llama-cli && \
cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT # TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8

View File

@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service

View File

@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
+%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
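A rough sketch of how the updated spec surfaces in a built package (the spec file name and rpmbuild layout here are assumptions):

    rpmbuild -ba llama.cpp.spec                                # rebuild with the new %files list
    rpm -qlp ~/rpmbuild/RPMS/*/llama*.rpm | grep completion    # llama-completion should now be packaged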

View File

@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

View File

@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+    exec ./llama-completion "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model (chat) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin"
+    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "      ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."

View File

@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -11,7 +11,7 @@ body:
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
+        The `llama-completion` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
@ -74,9 +74,12 @@ body:
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
+        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
      placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
+        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+        With short prompts or `-fa off` it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
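A reproduction command in the spirit of the updated template might look like this (model path and flag values are illustrative):

    ./llama-completion -m ./models/model-q4_0.gguf -fit off -ngl 99 -p "test prompt" -n 64
    # if it only reproduces with -fit on, capture logs with and without --verbose
    ./llama-completion -m ./models/model-q4_0.gguf -fit on -ngl 99 -p "test prompt" -n 64 --verbose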

View File

@ -20,7 +20,8 @@ on:
          '**/*.swift',
          '**/*.m',
          '**/*.metal',
-          '**/*.comp'
+          '**/*.comp',
+          '**/*.glsl'
      ]

  pull_request:
@ -40,7 +41,8 @@ on:
          '**/*.swift',
          '**/*.m',
          '**/*.metal',
-          '**/*.comp'
+          '**/*.comp',
+          '**/*.glsl'
      ]

concurrency:
@ -1400,26 +1402,55 @@ jobs:
        chip_type: ['910b', '310p']
        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

-      - name: Dependencies
+      - name: Free up disk space
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          tool-cache: true

+      - name: Set container image
+        id: cann-image
        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          echo "image=${image}" >> "${GITHUB_OUTPUT}"

+      - name: Pull container image
+        run: docker pull "${{ steps.cann-image.outputs.image }}"

      - name: Build
+        env:
+          BUILD_TYPE: ${{ matrix.build }}
+          SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+          HOST_UID=$(id -u)
+          HOST_GID=$(id -g)
+          docker run --rm \
+            -v "${PWD}:/workspace" \
+            -w /workspace \
+            -e SOC_TYPE=${SOC_TYPE} \
+            -e BUILD_TYPE=${BUILD_TYPE} \
+            "${{ steps.cann-image.outputs.image }}" \
+            bash -lc '
+              set -e
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+              yum clean all && rm -rf /var/cache/yum
+              git config --global --add safe.directory "/workspace"
+              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
              cmake -S . -B build \
-                -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+                -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
                -DGGML_CANN=on \
-                -DSOC_TYPE=ascend${{ matrix.chip_type }}
+                -DSOC_TYPE=${SOC_TYPE}
              cmake --build build -j $(nproc)
+              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+            '

  # TODO: simplify the following workflows using a matrix
  # TODO: run lighter CI on PRs and the full CI only on master (if needed)
  ggml-ci-x64-cpu-low-perf:
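The same containerized build should be reproducible outside CI along these lines (a sketch, assuming Docker and a llama.cpp checkout in the current directory):

    docker run --rm -v "$PWD:/workspace" -w /workspace \
        ascendai/cann:8.3.rc2-910b-openeuler24.03-py3.11 \
        bash -lc 'yum install -y git gcc gcc-c++ make cmake libcurl-devel && \
                  cmake -S . -B build -DGGML_CANN=on -DSOC_TYPE=ascend910b -DCMAKE_BUILD_TYPE=Release && \
                  cmake --build build -j $(nproc)'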

View File

@ -731,6 +731,78 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
openEuler-cann:
strategy:
matrix:
arch: [x86, aarch64]
chip_type: ['910b', '310p']
build: ['Release']
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Free up disk space
uses: ggml-org/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
run: docker pull "${{ steps.cann-image.outputs.image }}"
- name: Build
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
run: |
HOST_UID=$(id -u)
HOST_GID=$(id -g)
docker run --rm \
-v "${PWD}:/workspace" \
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
'
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -752,6 +824,7 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
+      - openEuler-cann

    steps:
      - name: Clone
@ -844,6 +917,12 @@ jobs:
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

+            **openEuler:**
+            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)

      - name: Upload release
        id: upload_release
        uses: actions/github-script@v3
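Once a release carries these artifacts, fetching one would look roughly like the following (the tag is a placeholder; substitute a real release tag):

    TAG=b7438   # placeholder
    curl -LO "https://github.com/ggml-org/llama.cpp/releases/download/${TAG}/llama-${TAG}-bin-910b-openEuler-aarch64.tar.gz"
    tar -xzf "llama-${TAG}-bin-910b-openEuler-aarch64.tar.gz"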

.github/workflows/server-webui.yml (new file, 295 lines)
View File

@ -0,0 +1,295 @@
# Server WebUI build and tests
name: Server WebUI
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

View File

@ -76,270 +76,6 @@ jobs:
      run: |
        pip install -r tools/server/tests/requirements.txt
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh
  server-windows:
    runs-on: windows-2022

.gitignore (1 line changed)
View File

@ -54,6 +54,7 @@
/out/
/tmp/
/autogen-*.md
+/common/build-info.cpp

# Deprecated

View File

@ -87,7 +87,8 @@
/tests/ @ggerganov
/tests/test-chat-.* @pwilkin
/tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov

View File

@ -313,7 +313,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)

#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@ -525,7 +525,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Other documentation

-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
- [server](tools/server/README.md)
- [GBNF grammars](grammars/README.md)

View File

@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.

A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

View File

@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
    model_f16="${path_models}/ggml-model-f16.gguf"

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)

    # for this model, the SEP token is "</s>"
    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

View File

@ -20,6 +20,7 @@
#include <nlohmann/json.hpp>

#include <algorithm>
+#include <cinttypes>
#include <climits>
#include <cstdarg>
#include <fstream>
@ -105,6 +106,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
    char * value = std::getenv(env);
    if (value) {
        output = value;
@ -114,6 +125,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
}

bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
    return env != nullptr && std::getenv(env);
}
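The effect of this compatibility shim, sketched in shell (using the context-shift option from later in this diff as the example):

    LLAMA_ARG_CONTEXT_SHIFT=on ./llama-completion -m model.gguf -p "hi"     # positive env, parsed as truthy
    LLAMA_ARG_NO_CONTEXT_SHIFT=1 ./llama-completion -m model.gguf -p "hi"   # legacy negative env, reported as "0" (falsey)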
@ -151,9 +170,10 @@ std::string common_arg::to_string() const {
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually abbreviation, we need padding to make it more beautiful
@ -162,7 +182,7 @@ std::string common_arg::to_string() const {
                ss << tmp << spaces;
            }
        } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
@ -181,6 +201,31 @@ std::string common_arg::to_string() const {
    return ss.str();
}
std::vector<std::string> common_arg::get_args() const {
std::vector<std::string> result;
for (const auto & arg : args) {
result.push_back(std::string(arg));
}
for (const auto & arg : args_neg) {
result.push_back(std::string(arg));
}
return result;
}
std::vector<std::string> common_arg::get_env() const {
std::vector<std::string> result;
if (env) {
result.push_back(std::string(env));
}
if (!args_neg.empty() && env) {
// for compatibility, we need to add LLAMA_ARG_NO_ variant
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
result.push_back(neg_env);
}
return result;
}
//
// utils
//
@ -316,6 +361,16 @@ static std::string get_all_kv_cache_types() {
    return msg.str();
}
static bool parse_bool_value(const std::string & value) {
if (is_truthy(value)) {
return true;
} else if (is_falsey(value)) {
return false;
} else {
throw std::invalid_argument("invalid boolean value");
}
}
//
// CLI argument parsing functions
//
@ -323,10 +378,13 @@ static std::string get_all_kv_cache_types() {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
        }
    }
@ -335,12 +393,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
@ -369,7 +430,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
@ -378,6 +441,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
            opt.handler_void(params);
            continue;
        }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }

        // arg with single value
        check_arg(i);
@ -402,7 +469,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
        }
    }
@ -438,7 +505,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // model is required (except for server)
    // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
        throw std::invalid_argument("error: --model is required\n");
    }
@ -463,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        params.kv_overrides.back().key[0] = 0;
    }

-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }
@ -573,6 +642,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-batched-bench", "llama-batched-bench",
"llama-bench", "llama-bench",
"llama-cli", "llama-cli",
"llama-completion",
"llama-convert-llama2c-to-ggml", "llama-convert-llama2c-to-ggml",
"llama-cvector-generator", "llama-cvector-generator",
"llama-embedding", "llama-embedding",
@ -657,7 +727,7 @@ static void add_rpc_devices(const std::string & servers) {
    }
}

-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
    common_params dummy_params;
    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
@ -666,6 +736,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<comm
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = &opt;
+        }
    }

    // TODO @ngxson : find a way to deduplicate this code
@ -750,11 +823,11 @@ static std::string list_builtin_chat_templates() {
}

bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
}

bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
}
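With these widened helpers, a boolean flag or env var should accept any of the spellings below (the invocation is illustrative):

    # truthy: on, enabled, true, 1 -- falsey: off, disabled, false, 0
    LLAMA_ARG_PERF=true ./llama-completion -m model.gguf -p "hi"
    LLAMA_ARG_PERF=off  ./llama-completion -m model.gguf -p "hi"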
bool common_arg_utils::is_autoy(const std::string & value) {
@ -762,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
}

common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// per-example default params
// we define here to make sure it's included in llama-gen-docs
if (ex == LLAMA_EXAMPLE_COMPLETION) {
params.use_jinja = false; // disable jinja by default
} else if (ex == LLAMA_EXAMPLE_MTMD) {
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
} else if (ex == LLAMA_EXAMPLE_SERVER) {
params.n_parallel = -1; // auto by default
}
    params.use_color = tty_can_use_colors();

    // load dynamic backends
@ -839,10 +925,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
+        {"--display-prompt"},
        {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
@ -1033,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SWA_FULL")); ).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg( add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N", {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
string_format("max number of context checkpoints to create per slot (default: %d)\n" string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints), "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_ctx_checkpoints = value; params.n_ctx_checkpoints = value;
@ -1041,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg( add_opt(common_arg(
{"--cache-ram", "-cram"}, "N", {"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n" string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib), "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) { [](common_params & params, int value) {
params.cache_ram_mib = value; params.cache_ram_mib = value;
@ -1049,24 +1136,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg( add_opt(common_arg(
{"--kv-unified", "-kvu"}, {"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n" "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.kv_unified = true; params.kv_unified = true;
} }
).set_env("LLAMA_ARG_KV_UNIFIED")); ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
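A single definition with args_neg now yields both CLI spellings; for the combined option above (model path illustrative):

    ./llama-completion -m model.gguf --context-shift      # handler_bool receives value=true
    ./llama-completion -m model.gguf --no-context-shift   # same handler, value=false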
    add_opt(common_arg(
@ -1106,20 +1186,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
+        {"--perf"},
        {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
        }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
    add_opt(common_arg(
+        {"--show-timings"},
        {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
    add_opt(common_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
@ -1171,16 +1253,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
        {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
        }
    ));
    add_opt(common_arg(
@ -1227,19 +1303,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
        "- does not print special tokens and suffix/prefix\n"
        "- interactive mode is also enabled\n"
        "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
@ -1297,10 +1367,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
    add_opt(common_arg(
+        {"--warmup"},
        {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
@ -1359,7 +1430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.top_k = value;
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
        }
-    ).set_sparam());
+    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
    add_opt(common_arg(
        {"--top-p"}, "N",
        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@ -1702,19 +1773,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg( add_opt(common_arg(
{"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"}, {"-nkvo", "--no-kv-offload"},
"disable KV offload", string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_kv_offload = true; params.no_kv_offload = !value;
} }
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--repack"},
{"-nr", "--no-repack"}, {"-nr", "--no-repack"},
"disable weight repacking", string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_extra_bufts = true; params.no_extra_bufts = !value;
} }
).set_env("LLAMA_ARG_NO_REPACK")); ).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg( add_opt(common_arg(
{"--no-host"}, {"--no-host"},
"bypass host buffer allowing extra buffers to be used", "bypass host buffer allowing extra buffers to be used",
@ -1827,6 +1900,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n"); LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
} }
).set_env("LLAMA_ARG_DEFRAG_THOLD")); ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
if (ex == LLAMA_EXAMPLE_SERVER) {
// this is to make sure this option appears in the server-specific section of the help message
add_opt(common_arg(
{"-np", "--parallel"}, "N",
string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
[](common_params & params, int value) {
if (value == 0) {
throw std::invalid_argument("error: invalid value for n_parallel\n");
}
params.n_parallel = value;
}
).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
} else {
add_opt(common_arg( add_opt(common_arg(
{"-np", "--parallel"}, "N", {"-np", "--parallel"}, "N",
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
@ -1834,6 +1920,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_parallel = value; params.n_parallel = value;
} }
).set_env("LLAMA_ARG_N_PARALLEL")); ).set_env("LLAMA_ARG_N_PARALLEL"));
}
add_opt(common_arg( add_opt(common_arg(
{"-ns", "--sequences"}, "N", {"-ns", "--sequences"}, "N",
string_format("number of sequences to decode (default: %d)", params.n_sequences), string_format("number of sequences to decode (default: %d)", params.n_sequences),
@ -1843,20 +1930,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PARALLEL})); ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg( add_opt(common_arg(
{"-cb", "--cont-batching"}, {"-cb", "--cont-batching"},
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), {"-nocb", "--no-cont-batching"},
[](common_params & params) { string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
params.cont_batching = true; [](common_params & params, bool value) {
params.cont_batching = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg( add_opt(common_arg(
{"-nocb", "--no-cont-batching"}, {"-mm", "--mmproj"}, "FILE",
"disable continuous batching",
[](common_params & params) {
params.cont_batching = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n" "path to a multimodal projector file. see tools/mtmd/README.md\n"
"note: if -hf is used, this argument can be omitted", "note: if -hf is used, this argument can be omitted",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
@ -1864,26 +1945,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
add_opt(common_arg( add_opt(common_arg(
{"--mmproj-url"}, "URL", {"-mmu", "--mmproj-url"}, "URL",
"URL to a multimodal projector file. see tools/mtmd/README.md", "URL to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.mmproj.url = value; params.mmproj.url = value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg( add_opt(common_arg(
{"--no-mmproj"}, {"--mmproj-auto"},
"explicitly disable multimodal projector, useful when using -hf", {"--no-mmproj", "--no-mmproj-auto"},
[](common_params & params) { string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
params.no_mmproj = true; [](common_params & params, bool value) {
params.no_mmproj = !value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg( add_opt(common_arg(
{"--mmproj-offload"},
{"--no-mmproj-offload"}, {"--no-mmproj-offload"},
"do not offload multimodal projector to GPU", string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.mmproj_use_gpu = false; params.mmproj_use_gpu = value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--image", "--audio"}, "FILE", {"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
@ -1923,12 +2006,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_MLOCK")); ).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg( add_opt(common_arg(
{"--mmap"},
{"--no-mmap"}, {"--no-mmap"},
"do not memory-map model (slower load but may reduce pageouts if not using mlock)", string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.use_mmap = false; params.use_mmap = value;
} }
).set_env("LLAMA_ARG_NO_MMAP")); ).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg( add_opt(common_arg(
{"--numa"}, "TYPE", {"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n" "attempt optimizations that help on some NUMA systems\n"
@ -2098,6 +2182,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
} }
).set_env("LLAMA_ARG_MAIN_GPU")); ).set_env("LLAMA_ARG_MAIN_GPU"));
add_opt(common_arg(
{ "-fit", "--fit" }, "[on|off]",
string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.fit_params = true;
} else if (is_falsey(value)) {
params.fit_params = false;
} else {
throw std::runtime_error(
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
{ "-fitt", "--fit-target" }, "MiB",
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
[](common_params & params, int value) {
params.fit_params_target = value * size_t(1024*1024);
}
).set_env("LLAMA_ARG_FIT_TARGET"));
add_opt(common_arg(
{ "-fitc", "--fit-ctx" }, "N",
string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
[](common_params & params, int value) {
params.fit_params_min_ctx = value;
}
).set_env("LLAMA_ARG_FIT_CTX"));
add_opt(common_arg( add_opt(common_arg(
{"--check-tensors"}, {"--check-tensors"},
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@ -2116,10 +2228,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--op-offload"},
{"--no-op-offload"}, {"--no-op-offload"},
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_op_offload = true; params.no_op_offload = !value;
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
@ -2315,10 +2428,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--ppl"},
{"--no-ppl"}, {"--no-ppl"},
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.compute_ppl = false; params.compute_ppl = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
@ -2437,12 +2551,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg( add_opt(common_arg(
{"--webui"},
{"--no-webui"}, {"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.webui = false; params.webui = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2547,18 +2662,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg( add_opt(common_arg(
{"--slots"}, {"--slots"},
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), {"--no-slots"},
[](common_params & params) { string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
params.endpoint_slots = true; [](common_params & params, bool value) {
params.endpoint_slots = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--no-slots"},
"disables slots monitoring endpoint",
[](common_params & params) {
params.endpoint_slots = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
add_opt(common_arg( add_opt(common_arg(
{"--slot-save-path"}, "PATH", {"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)", "path to save slot kv cache (default: disabled)",
@ -2609,26 +2718,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg( add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"}, {"--no-models-autoload"},
"disables automatic loading of models (default: enabled)", string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.models_autoload = false; params.models_autoload = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--jinja"}, {"--jinja"},
string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), {"--no-jinja"},
[](common_params & params) { string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
params.use_jinja = true; [](common_params & params, bool value) {
params.use_jinja = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg( add_opt(common_arg(
{"--reasoning-format"}, "FORMAT", {"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2673,15 +2777,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg( add_opt(common_arg(
{"--prefill-assistant"},
{"--no-prefill-assistant"}, {"--no-prefill-assistant"},
string_format( string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
), ),
[](common_params & params) { [](common_params & params, bool value) {
params.prefill_assistant = false; params.prefill_assistant = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg( add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY", {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

View File

@ -16,6 +16,7 @@ struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {}; std::set<enum llama_example> excludes = {};
std::vector<const char *> args; std::vector<const char *> args;
std::vector<const char *> args_neg; // for negated args like --no-xxx
const char * value_hint = nullptr; // help text or example for arg value const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr; const char * env = nullptr;
@ -25,6 +26,7 @@ struct common_arg {
void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (common_params & params, int) = nullptr; void (*handler_int) (common_params & params, int) = nullptr;
void (*handler_bool) (common_params & params, bool) = nullptr;
common_arg() = default; common_arg() = default;
@ -48,6 +50,13 @@ struct common_arg {
void (*handler)(common_params & params) void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {} ) : args(args), help(help), handler_void(handler) {}
common_arg(
const std::initializer_list<const char *> & args,
const std::initializer_list<const char *> & args_neg,
const std::string & help,
void (*handler)(common_params & params, bool)
) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
// support 2 values for arg // support 2 values for arg
common_arg( common_arg(
const std::initializer_list<const char *> & args, const std::initializer_list<const char *> & args,
@ -80,6 +89,10 @@ struct common_arg {
} }
return strcmp(args[0], other.args[0]) == 0; return strcmp(args[0], other.args[0]) == 0;
} }
// get all args and env vars (including negated args/env)
std::vector<std::string> get_args() const;
std::vector<std::string> get_env() const;
}; };
namespace common_arg_utils { namespace common_arg_utils {
@ -102,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
// parse input arguments from CLI into a map // parse input arguments from CLI into a map
// TODO: support repeated args in the future // TODO: support repeated args in the future
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map); bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
// initialize argument parser context - used by test-arg-parser and preset // initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
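get_args() and get_env() give callers (help printing, the preset key mapping later in this diff) a single list covering both spellings. Their bodies are not shown here; a guess at the shape, written as a free function to avoid implying the real implementation:

    #include <string>
    #include <vector>

    // Hypothetical stand-in for common_arg::get_args(): positive spellings
    // first, then the negated ones.
    static std::vector<std::string> all_spellings(
            const std::vector<const char *> & args,
            const std::vector<const char *> & args_neg) {
        std::vector<std::string> out(args.begin(), args.end());
        out.insert(out.end(), args_neg.begin(), args_neg.end());
        return out;
    }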

View File

@ -4,9 +4,14 @@
using json = nlohmann::json; using json = nlohmann::json;
static std::string_view trim_trailing_space(std::string_view sv) { static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
int count = 0;
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) { while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
if (max != -1 && count >= max) { // stop once 'max' trailing characters have been trimmed
break;
}
sv.remove_suffix(1); sv.remove_suffix(1);
count++;
} }
return sv; return sv;
} }
@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_string && current_tool) { if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote // Serialize to JSON, but exclude the end quote
std::string dumped = json(node.text).dump(); std::string dumped = json(trim_trailing_space(node.text)).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1); current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true; needs_closing_quote = true;
} }
@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_close && current_tool) { if (is_arg_close && current_tool) {
if (needs_closing_quote) { if (needs_closing_quote) {
current_tool->arguments += "\""; current_tool->arguments += "\"";
needs_closing_quote = false;
} }
} }
@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
} }
if (is_tool_close && current_tool) { if (is_tool_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
needs_closing_quote = false;
}
current_tool->arguments += "}"; current_tool->arguments += "}";
} }
} }
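The mapper streams tool-call arguments as partial JSON: a string value is emitted as json(...).dump() minus its closing quote, and the two new blocks guarantee that quote is appended exactly once, whether the argument closes normally or the tool closes first. Traced for an argument value "foo":

    #include <nlohmann/json.hpp>
    #include <string>

    int main() {
        std::string arguments = "{\"text\":";
        std::string dumped = nlohmann::json("foo").dump(); // "\"foo\""
        arguments += dumped.substr(0, dumped.size() - 1);  // {"text":"foo   (no end quote yet)
        bool needs_closing_quote = true;
        // ... more streamed chunks may append here ...
        if (needs_closing_quote) { // runs on arg close OR tool close, whichever comes first
            arguments += "\"";
            needs_closing_quote = false;
        }
        arguments += "}";          // {"text":"foo"}
    }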

View File

@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function<void(const
} }
} }
static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
if (!function.contains("parameters") || !function.at("parameters").is_object()) {
return;
}
const auto & params = function.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & props = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
for (const auto & [name, prop] : props.items()) {
bool is_required = (required.find(name) != required.end());
fn(name, prop, is_required);
}
}
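Given a typical OpenAI-style function schema, the callback fires once per property with its required flag. A hypothetical invocation (json is this file's ordered_json alias; the tool shape is illustrative):

    json function = json::parse(R"({
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {
                "city": { "type": "string" },
                "unit": { "type": "string", "enum": ["C", "F"] }
            },
            "required": ["city"]
        }
    })");
    foreach_parameter(function, [](const std::string & name, const json & prop, bool is_required) {
        // called with ("city", ..., true) and ("unit", ..., false)
    });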
static std::string apply( static std::string apply(
const common_chat_template & tmpl, const common_chat_template & tmpl,
const struct templates_params & inputs, const struct templates_params & inputs,
@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data; return data;
} }
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
}
}
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
auto schema_info = common_schema_info();
schema_info.resolve_refs(parameters);
auto tool_open = "<function=" + p.tool_name(p.literal(name)) + ">\n";
auto tool_close = p.literal("</function>\n");
auto args = p.sequence();
auto arg_string = p.rule("xml-arg-string", p.until_one_of({
"\n</parameter>",
"\n<parameter=",
"\n</function>"
}));
foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
auto rule_name = "tool-" + name + "-arg-" + param_name;
auto arg_open = "<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">\n";
auto arg_close = p.literal("</parameter>\n");
auto arg_value = p.eps();
if (schema_info.resolves_to_string(param_schema)) {
arg_value = p.tool_arg_string_value(arg_string) + "\n";
} else {
arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
}
// Model may or may not close with </parameter>
auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
});
tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
});
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_call = p.rule("tool-call", "<tool_call>\n" + tool_choice + "</tool_call>" + p.space());
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
return reasoning << p.content(p.until("<tool_call>")) << tool_calls;
}
// Content only parser
include_grammar = false;
return reasoning << p.content(p.rest());
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"}
};
}
return data;
}
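Assembled from the literals above, the Nemotron v3 parser accepts output of roughly this shape; tool and parameter names here are illustrative, string-typed parameters arrive as raw text, and everything else must be valid JSON per the per-parameter schema rules:

    <think>
    ...reasoning, closed by </think> when thinking is enabled...
    </think>
    <tool_call>
    <function=get_weather>
    <parameter=city>
    Paris
    </parameter>
    </function>
    </tool_call>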
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) { static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data; common_chat_params data;
@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("<function=") != std::string::npos && src.find("<function=") != std::string::npos &&
src.find("<parameters>") != std::string::npos && src.find("<parameters>") != std::string::npos &&
src.find("<parameter=") != std::string::npos) { src.find("<parameter=") != std::string::npos) {
// Nemotron 3 Nano 30B A3B
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params); return common_chat_params_init_qwen3_coder_xml(tmpl, params);
} }

View File

@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
// Model utils // Model utils
// //
static inline void common_init_sampler_from_model( // TODO: move to common/sampling
static void common_init_sampler_from_model(
const llama_model * model, const llama_model * model,
common_params_sampling & sparams) { common_params_sampling & sparams) {
const uint64_t config = sparams.user_sampling_config; const uint64_t config = sparams.user_sampling_config;
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) { auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
if (config & user_config) return; if (config & user_config) {
return;
}
char buf[64] = {0}; char buf[64] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr; char * end = nullptr;
int32_t v = strtol(buf, &end, 10); int32_t v = strtol(buf, &end, 10);
if (end && end != buf) dst = v; if (end && end != buf) {
dst = v;
}
} }
}; };
auto get_float = [&](const char * key, float & dst, uint64_t user_config) { auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
if (config & user_config) return; if (config & user_config) {
return;
}
char buf[128] = {0}; char buf[128] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr; char * end = nullptr;
float v = strtof(buf, &end); float v = strtof(buf, &end);
if (end && end != buf) dst = v; if (end && end != buf) {
dst = v;
}
} }
}; };
@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA); get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
} }
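Each getter returns early when the matching bit is already set in user_sampling_config, so GGUF sampling metadata only fills in parameters the user left untouched. The guard, isolated as a sketch:

    #include <cstdint>

    // Precedence enforced by get_int32/get_float above:
    // explicit CLI value > model metadata > compiled-in default.
    static void apply_meta_default(uint64_t config, uint64_t user_bit,
                                   int32_t meta_value, int32_t & dst) {
        if (config & user_bit) {
            return;        // user set this explicitly (e.g. --top-k), keep it
        }
        dst = meta_value;  // otherwise the model's metadata overrides the default
    }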
struct common_init_result common_init_from_params(common_params & params) { struct common_init_result::impl {
common_init_result iparams; impl() = default;
~impl() = default;
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_sampler_ptr> samplers;
};
common_init_result::common_init_result(common_params & params) :
pimpl(new impl{}) {
auto mparams = common_model_params_to_llama(params); auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", return;
__func__, params.model.path.c_str());
return iparams;
} }
common_init_sampler_from_model(model, params.sampling); pimpl->model.reset(model);
const llama_vocab * vocab = llama_model_get_vocab(model); const llama_vocab * vocab = llama_model_get_vocab(model);
auto cparams = common_context_params_to_llama(params); // updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
}
//if (params.sampling.penalty_last_n == -1) {
// LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
// params.sampling.penalty_last_n = llama_n_ctx(lctx);
//}
//if (params.sampling.dry_penalty_last_n == -1) {
// LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
//}
pimpl->samplers.resize(cparams.n_seq_max);
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
}
llama_context * lctx = llama_init_from_model(model, cparams); llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) { if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
__func__, params.model.path.c_str()); return;
llama_model_free(model);
return iparams;
} }
pimpl->context.reset(lctx);
}
llama_model * common_init_result::model() {
return pimpl->model.get();
}
llama_context * common_init_result::context() {
return pimpl->context.get();
}
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}
void common_init_result::free_context() {
pimpl->context.reset();
}
common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result_ptr res(new common_init_result(params));
llama_model * model = res->model();
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return res;
}
llama_context * lctx = res->context();
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
return res;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) { if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false; params.ctx_shift = false;
@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
const auto cvec = common_control_vector_load(params.control_vectors); const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) { if (cvec.n_embd == -1) {
llama_free(lctx); return res;
llama_model_free(model);
return iparams;
} }
int err = llama_apply_adapter_cvec( int err = llama_apply_adapter_cvec(
@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
params.control_vector_layer_start, params.control_vector_layer_start,
params.control_vector_layer_end); params.control_vector_layer_end);
if (err) { if (err) {
llama_free(lctx); return res;
llama_model_free(model);
return iparams;
} }
} }
@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
} }
if (!ok) { if (!ok) {
llama_free(lctx); return res;
llama_model_free(model);
return iparams;
} }
} }
@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
lora.reset(llama_adapter_lora_init(model, la.path.c_str())); lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) { if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx); return res;
llama_model_free(model);
return iparams;
} }
char buf[1024]; char buf[1024];
@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
la.task_name = buf; la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf; la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
} }
if (!params.lora_init_without_apply) { if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters); common_set_adapter_lora(lctx, params.lora_adapters);
} }
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
}
if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}
if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}
if (params.warmup) { if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_set_warmup(lctx, false); llama_set_warmup(lctx, false);
} }
iparams.model.reset(model); return res;
iparams.context.reset(lctx);
return iparams;
} }
common_init_result::~common_init_result() = default;
std::string get_model_endpoint() { std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT"); const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility. // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
std::string model_endpoint = "https://huggingface.co/"; std::string model_endpoint = "https://huggingface.co/";
if (endpoint_env) { if (endpoint_env) {
model_endpoint = endpoint_env; model_endpoint = endpoint_env;
if (model_endpoint.back() != '/') model_endpoint += '/'; if (model_endpoint.back() != '/') {
model_endpoint += '/';
}
} }
return model_endpoint; return model_endpoint;
} }

View File

@ -99,6 +99,7 @@ enum llama_example {
LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_COUNT, LLAMA_EXAMPLE_COUNT,
}; };
@ -195,7 +196,6 @@ struct common_params_sampling {
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
std::vector<enum common_sampler_type> samplers = { std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_DRY,
@ -216,6 +216,10 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
bool has_logit_bias() const {
return !logit_bias.empty();
}
// print the parameters into a string // print the parameters into a string
std::string print() const; std::string print() const;
}; };
@ -303,8 +307,8 @@ struct lr_opt {
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params { struct common_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
int32_t n_ctx = 4096; // context size int32_t n_ctx = 0; // context size, 0 == context the model was trained with
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
@ -328,6 +332,9 @@ struct common_params {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024 * 1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@ -669,15 +676,29 @@ bool tty_can_use_colors();
// Model utils // Model utils
// //
// note: defines object's lifetime struct common_sampler;
struct common_init_result {
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora; // note: defines the lifetimes of the model, context, samplers, etc.
struct common_init_result {
common_init_result(common_params & params);
~common_init_result();
llama_model * model();
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);
std::vector<llama_adapter_lora_ptr> & lora();
void free_context();
private:
struct impl;
std::unique_ptr<impl> pimpl;
}; };
struct common_init_result common_init_from_params(common_params & params); using common_init_result_ptr = std::unique_ptr<common_init_result>;
common_init_result_ptr common_init_from_params(common_params & params);
struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params);
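A sketch of the call-site pattern the new interface implies, based only on the declarations above (error handling abbreviated):

    // Sketch: model, context, and per-sequence samplers now share one owner.
    static int run(common_params & params) {
        common_init_result_ptr init = common_init_from_params(params);
        llama_model   * model = init->model();
        llama_context * lctx  = init->context();
        if (model == nullptr || lctx == nullptr) {
            return 1;  // whatever was constructed is released by the destructor
        }
        common_sampler * smpl = init->sampler(0); // sampler for seq_id 0
        // ... run inference with model, lctx, smpl ...
        return 0;      // init going out of scope frees samplers, context, model
    }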

View File

@ -12,6 +12,8 @@
#include <filesystem> #include <filesystem>
#include <fstream> #include <fstream>
#include <future> #include <future>
#include <map>
#include <mutex>
#include <regex> #include <regex>
#include <string> #include <string>
#include <thread> #include <thread>
@ -472,6 +474,18 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
#elif defined(LLAMA_USE_HTTPLIB) #elif defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
static void cleanup(const ProgressBar * line) {
lines.erase(line);
if (lines.empty()) {
max_line = 0;
}
}
static bool is_output_a_tty() { static bool is_output_a_tty() {
#if defined(_WIN32) #if defined(_WIN32)
return _isatty(_fileno(stdout)); return _isatty(_fileno(stdout));
@ -480,7 +494,15 @@ static bool is_output_a_tty() {
#endif #endif
} }
static void print_progress(size_t current, size_t total) { public:
ProgressBar() = default;
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
cleanup(this);
}
void update(size_t current, size_t total) {
if (!is_output_a_tty()) { if (!is_output_a_tty()) {
return; return;
} }
@ -489,19 +511,42 @@ static void print_progress(size_t current, size_t total) {
return; return;
} }
std::lock_guard<std::mutex> lock(mutex);
if (lines.find(this) == lines.end()) {
lines[this] = max_line++;
std::cout << "\n";
}
int lines_up = max_line - lines[this];
size_t width = 50; size_t width = 50;
size_t pct = (100 * current) / total; size_t pct = (100 * current) / total;
size_t pos = (width * current) / total; size_t pos = (width * current) / total;
std::cout << "[" std::cout << "\033[s";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << "\033[2K\r["
<< std::string(pos, '=') << std::string(pos, '=')
<< (pos < width ? ">" : "") << (pos < width ? ">" : "")
<< std::string(width - pos, ' ') << std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% (" << "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / " << current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB)\r"; << total / (1024 * 1024) << " MB) "
<< "\033[u";
std::cout.flush(); std::cout.flush();
if (current == total) {
cleanup(this);
} }
}
ProgressBar(const ProgressBar &) = delete;
ProgressBar & operator=(const ProgressBar &) = delete;
};
static bool common_pull_file(httplib::Client & cli, static bool common_pull_file(httplib::Client & cli,
const std::string & resolve_path, const std::string & resolve_path,
@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
const char * func = __func__; // avoid __func__ inside a lambda const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size; size_t downloaded = existing_size;
size_t progress_step = 0; size_t progress_step = 0;
ProgressBar bar;
auto res = cli.Get(resolve_path, headers, auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) { [&](const httplib::Response &response) {
@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
progress_step += len; progress_step += len;
if (progress_step >= total_size / 1000 || downloaded == total_size) { if (progress_step >= total_size / 1000 || downloaded == total_size) {
print_progress(downloaded, total_size); bar.update(downloaded, total_size);
progress_step = 0; progress_step = 0;
} }
return true; return true;
@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
nullptr nullptr
); );
std::cout << "\n";
if (!res) { if (!res) {
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1); LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false; return false;
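The escape sequences above are standard ANSI control codes: \033[s and \033[u save and restore the cursor, \033[NA moves up N rows, and \033[2K erases the current line. Each bar claims a line index under the shared mutex, so concurrent downloads repaint their own rows without interleaving. The repaint step, condensed:

    #include <iostream>
    #include <string>

    // Redraw a line that is lines_up rows above the cursor, then return the cursor.
    static void repaint(int lines_up, const std::string & text) {
        std::cout << "\033[s";                       // save cursor position
        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A"; // move up
        }
        std::cout << "\033[2K\r" << text;            // erase the line, redraw
        std::cout << "\033[u";                       // restore cursor position
        std::cout.flush();
    }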

View File

@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); } std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
class SchemaConverter { class common_schema_converter {
private: private:
friend class common_schema_info;
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options); friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json; std::function<json(const std::string &)> _fetch_json;
bool _dotall; bool _dotall;
@ -729,7 +730,7 @@ private:
} }
public: public:
SchemaConverter( common_schema_converter(
const std::function<json(const std::string &)> & fetch_json, const std::function<json(const std::string &)> & fetch_json,
bool dotall) bool dotall)
: _fetch_json(fetch_json), _dotall(dotall) : _fetch_json(fetch_json), _dotall(dotall)
@ -990,6 +991,134 @@ public:
} }
}; };
// common_schema_info implementation (pimpl)
common_schema_info::common_schema_info()
: impl_(std::make_unique<common_schema_converter>(
[](const std::string &) { return json(); },
false)) {}
common_schema_info::~common_schema_info() = default;
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
impl_->resolve_refs(schema, "");
}
// Determines if a JSON schema can resolve to a string type through any path.
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
// true, allowing callers to handle the value as a raw string for simplicity.
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
std::unordered_set<std::string> visited_refs;
std::function<bool(const json &)> check = [&](const json & s) -> bool {
if (!s.is_object()) {
return false;
}
// Handle $ref
if (s.contains("$ref")) {
const std::string & ref = s["$ref"];
if (visited_refs.find(ref) != visited_refs.end()) {
// Circular reference, assume not a string to be safe
return false;
}
visited_refs.insert(ref);
auto it = impl_->_refs.find(ref);
if (it != impl_->_refs.end()) {
return check(it->second);
}
return false;
}
// Check type field
if (s.contains("type")) {
const json & schema_type = s["type"];
if (schema_type.is_string()) {
if (schema_type == "string") {
return true;
}
} else if (schema_type.is_array()) {
// Type can be an array like ["string", "null"]
for (const auto & t : schema_type) {
if (t == "string") {
return true;
}
}
}
}
// Check oneOf/anyOf - if any alternative can be a string
if (s.contains("oneOf")) {
for (const auto & alt : s["oneOf"]) {
if (check(alt)) {
return true;
}
}
}
if (s.contains("anyOf")) {
for (const auto & alt : s["anyOf"]) {
if (check(alt)) {
return true;
}
}
}
// Check allOf - all components must be compatible with string type
if (s.contains("allOf")) {
bool all_string = true;
for (const auto & component : s["allOf"]) {
if (!check(component)) {
all_string = false;
break;
}
}
if (all_string) {
return true;
}
}
// Check const - if the constant value is a string
if (s.contains("const")) {
if (s["const"].is_string()) {
return true;
}
}
// Check enum - if any enum value is a string
if (s.contains("enum")) {
for (const auto & val : s["enum"]) {
if (val.is_string()) {
return true;
}
}
}
// String-specific keywords imply string type
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
return true;
}
// Check format - many formats imply string
if (s.contains("format")) {
const std::string & fmt = s["format"];
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
fmt.find("uuid") == 0) {
return true;
}
}
return false;
};
return check(schema);
}
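A few probes and the answer each would get, assuming the usual nlohmann includes (schemas here are illustrative):

    #include <nlohmann/json.hpp>

    using ordered_json = nlohmann::ordered_json;

    static void demo() {
        common_schema_info info;
        info.resolves_to_string(ordered_json::parse(R"({"type": "string"})"));              // true
        info.resolves_to_string(ordered_json::parse(R"({"type": ["integer", "string"]})")); // true: type array
        info.resolves_to_string(ordered_json::parse(R"({"anyOf": [{"type": "number"},
                                                                  {"const": "auto"}]})"));  // true: string const
        info.resolves_to_string(ordered_json::parse(R"({"type": "object"})"));              // false
    }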
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE #ifdef LLAMA_USE_LLGUIDANCE
if (!force_gbnf) { if (!force_gbnf) {
@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
} }
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) { std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall); common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
common_grammar_builder builder { common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) { /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule); return converter._add_rule(name, rule);

View File

@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp> #include <nlohmann/json_fwd.hpp>
#include <functional> #include <functional>
#include <memory>
#include <string> #include <string>
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false); bool force_gbnf = false);
class common_schema_converter;
// Probes a JSON schema to extract information about its structure and type constraints.
class common_schema_info {
std::unique_ptr<common_schema_converter> impl_;
public:
common_schema_info();
~common_schema_info();
common_schema_info(const common_schema_info &) = delete;
common_schema_info & operator=(const common_schema_info &) = delete;
common_schema_info(common_schema_info &&) noexcept;
common_schema_info & operator=(common_schema_info &&) noexcept;
void resolve_refs(nlohmann::ordered_json & schema);
bool resolves_to_string(const nlohmann::ordered_json & schema);
};
struct common_grammar_builder { struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule; std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema; std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;

View File

@ -425,7 +425,7 @@ struct parser_executor {
if (result.need_more_input()) { if (result.need_more_input()) {
// Propagate - need to know what child would match before negating // Propagate - need to know what child would match before negating
return result; return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
} }
// Child failed, so negation succeeds // Child failed, so negation succeeds

View File

@ -23,10 +23,16 @@ std::vector<std::string> common_preset::to_args() const {
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value // flag option, no value
if (common_arg_utils::is_falsey(value)) { if (common_arg_utils::is_falsey(value)) {
// skip the flag // use negative arg if available
if (!opt.args_neg.empty()) {
args.back() = opt.args_neg.back();
} else {
// otherwise, skip the flag
// TODO: maybe throw an error instead?
args.pop_back(); args.pop_back();
} }
} }
}
if (opt.value_hint != nullptr) { if (opt.value_hint != nullptr) {
// single value // single value
args.push_back(value); args.push_back(value);
@ -141,16 +147,31 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) { static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
std::map<std::string, common_arg> mapping; std::map<std::string, common_arg> mapping;
for (const auto & opt : ctx_params.options) { for (const auto & opt : ctx_params.options) {
if (opt.env != nullptr) { for (const auto & env : opt.get_env()) {
mapping[opt.env] = opt; mapping[env] = opt;
} }
for (const auto & arg : opt.args) { for (const auto & arg : opt.get_args()) {
mapping[rm_leading_dashes(arg)] = opt; mapping[rm_leading_dashes(arg)] = opt;
} }
} }
return mapping; return mapping;
} }
static bool is_bool_arg(const common_arg & arg) {
return !arg.args_neg.empty();
}
static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
// if this is a negated arg, we need to reverse the value
for (const auto & neg_arg : arg.args_neg) {
if (rm_leading_dashes(neg_arg) == key) {
return common_arg_utils::is_truthy(value) ? "false" : "true";
}
}
// otherwise, not negated
return value;
}
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
common_presets out; common_presets out;
auto key_to_opt = get_map_key_opt(ctx_params); auto key_to_opt = get_map_key_opt(ctx_params);
@ -167,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte
for (const auto & [key, value] : section.second) { for (const auto & [key, value] : section.second) {
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (key_to_opt.find(key) != key_to_opt.end()) { if (key_to_opt.find(key) != key_to_opt.end()) {
preset.options[key_to_opt[key]] = value; auto & opt = key_to_opt[key];
LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str()); if (is_bool_arg(opt)) {
preset.options[opt] = parse_bool_arg(opt, key, value);
} else {
preset.options[opt] = value;
}
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
} else { } else {
// TODO: maybe warn about unknown key? // TODO: maybe warn about unknown key?
} }
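To make the new boolean handling concrete: for a hypothetical flag defined as `--warmup` with negated form `--no-warmup`, the INI line `no-warmup = true` now loads as the canonical option set to `"false"`, and `to_args` later emits `--no-warmup` instead of silently dropping the flag, so the negation survives a save/load round trip. A self-contained sketch of the flip (not the real `common_arg`, and using plain string comparison where the real code uses `is_truthy`):

```cpp
#include <string>
#include <vector>

// Illustrative stand-in for the relevant common_arg fields.
struct arg_sketch {
    std::vector<std::string> args;     // canonical forms, e.g. {"--warmup"}
    std::vector<std::string> args_neg; // negated forms,   e.g. {"--no-warmup"}
};

// Mirrors parse_bool_arg above: a truthy value given under a negated key
// stores "false" for the canonical option, and vice versa.
std::string parse_bool_sketch(const arg_sketch & a, const std::string & key, const std::string & value) {
    for (const auto & neg : a.args_neg) {
        if (neg == "--" + key) {
            return value == "true" ? "false" : "true";
        }
    }
    return value; // the key was the canonical form; keep the value as-is
}
```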


@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler { struct common_sampler {
common_params_sampling params; common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * chain; struct llama_sampler * chain;
bool grammar;
ring_buffer<llama_token> prev; ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur; std::vector<llama_token_data> cur;
@ -116,7 +117,6 @@ struct common_sampler {
void reset() { void reset() {
prev.clear(); prev.clear();
llama_sampler_reset(grmr);
llama_sampler_reset(chain); llama_sampler_reset(chain);
} }
@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
struct llama_sampler * grmr; llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) { if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE #ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else #else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE #endif // LLAMA_USE_LLGUIDANCE
@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
trigger_patterns_c.push_back(regex.c_str()); trigger_patterns_c.push_back(regex.c_str());
} }
grmr = params.grammar_lazy if (!params.grammar.empty()) {
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", if (params.grammar_lazy) {
samplers.push_back(
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(), trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size()) trigger_tokens.data(), trigger_tokens.size()));
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); } else {
if (!grmr) { samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
return nullptr; }
grammar = true;
} }
} }
auto * result = new common_sampler { if (params.has_logit_bias()) {
/* .params = */ params, samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
/* .grmr = */ grmr, }
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_vocab_n_tokens(vocab),
params.logit_bias.size(),
params.logit_bias.data()));
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) { for (const auto & cnstr : params.samplers) {
@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
} }
break; break;
case COMMON_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); samplers.push_back(llama_sampler_init_top_k (params.top_k));
break; break;
case COMMON_SAMPLER_TYPE_TOP_P: case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
break; break;
case COMMON_SAMPLER_TYPE_MIN_P: case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_XTC: case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break; break;
case COMMON_SAMPLER_TYPE_TYPICAL_P: case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_TEMPERATURE: case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); samplers.push_back(llama_sampler_init_infill (vocab));
break; break;
case COMMON_SAMPLER_TYPE_PENALTIES: case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break; break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
} }
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
samplers.push_back(llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) { } else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); samplers.push_back(llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) { } else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); samplers.push_back(llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else { } else {
GGML_ASSERT(false && "unknown mirostat version"); GGML_ASSERT(false && "unknown mirostat version");
} }
for (auto * smpl : samplers) {
llama_sampler_chain_add(chain, smpl);
}
auto * result = new common_sampler {
/* .params = */ params,
/* .chain = */ chain,
/* .grammar = */ grammar,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
return result; return result;
} }
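With the separate `grmr` member gone, a single chain now carries everything, and `gsmpl->grammar` records that slot 0 holds the grammar (or llguidance) constraint. The order built above is: grammar if any, then logit bias if any, then the configured samplers (or a mirostat pair), then the final `dist` selector. A small sketch to print that layout, using only functions that appear in this diff:

```cpp
#include <cstdio>

// Sketch: dump the sampler chain to verify the ordering described above.
void dump_chain(const common_sampler * gsmpl) {
    llama_sampler * chain = common_sampler_get(gsmpl);
    const int n = llama_sampler_chain_n(chain);
    for (int i = 0; i < n; i++) {
        printf("[%d] %s\n", i, llama_sampler_name(llama_sampler_chain_get(chain, i)));
    }
}
```

This ordering is exactly what `common_sampler_accept` below relies on when it treats index 0 as the grammar sampler and skips it unless `accept_grammar` is set.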
void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) { if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain); llama_sampler_free(gsmpl->chain);
delete gsmpl; delete gsmpl;
@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm(); const auto tm = gsmpl->tm();
if (accept_grammar) { if (gsmpl->grammar) {
llama_sampler_accept(gsmpl->grmr, token); const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
}
for (int i = 0; i < n_smpl; i++) {
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
// the grammar sampler is always the first one
if (i == 0) {
if (accept_grammar) {
llama_sampler_accept(smpl, token);
}
} else {
llama_sampler_accept(smpl, token);
}
}
} else {
llama_sampler_accept(gsmpl->chain, token); llama_sampler_accept(gsmpl->chain, token);
}
gsmpl->prev.push_back(token); gsmpl->prev.push_back(token);
} }
@ -330,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler { return new common_sampler {
/* .params = */ gsmpl->params, /* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain), /* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .grammar = */ gsmpl->grammar,
/* .prev = */ gsmpl->prev, /* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur, /* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p, /* .cur_p = */ gsmpl->cur_p,
@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
} }
} }
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
return gsmpl->chain;
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_synchronize(ctx); llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
const auto tm = gsmpl->tm(); const auto tm = gsmpl->tm();
gsmpl->set_logits(ctx, idx); llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain; auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits auto & cur_p = gsmpl->cur_p; // initialized by set_logits
if (grammar_first) { gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p); llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
const llama_token id = cur_p.data[cur_p.selected].id; id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id; return id;
} }
// check if it the sampled token fits the grammar std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
return cur_p.data[cur_p.selected].id;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result; std::vector<llama_token> result;
@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
size_t i = 0; size_t i = 0;
for (; i < draft.size(); i++) { for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true); common_sampler_accept(gsmpl, id, true);
@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
} }
if (i == draft.size()) { if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true); common_sampler_accept(gsmpl, id, true);
@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
return result; return result;
} }
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<int> idxs(draft.size() + 1); std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) { for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i; idxs[i] = i;
} }
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
} }
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
result += std::string("-> ") + llama_sampler_name(smpl) + " "; result += std::string("-> ");
result += std::string(llama_sampler_name(smpl)) + " ";
} }
return result; return result;


@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing // arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// extended sampling implementation: // extended sampling implementation:
// //
// - set logits // - set logits
@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// - check if the token fits the grammar (if any) // - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path) // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
// //
// if grammar_first is true, the grammar is applied before the samplers (slower) llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
// generalized version of common_sampler_sample // generalized version of common_sampler_sample
// //
@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// //
// returns at least 1 token, up to idxs.size() // returns at least 1 token, up to idxs.size()
// //
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false); std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
// assume idxs == [ 0, 1, 2, ..., draft.size() ] // assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
const char * grammar_kind, const char * grammar_data); const char * grammar_kind, const char * grammar_data);
struct common_sampler_deleter {
void operator()(common_sampler * s) { common_sampler_free(s); }
};
typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
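A short usage sketch for the new alias; `model`, `ctx`, and `params` are assumed to exist, and `params.sampling` as the sampling-params member is an assumption of this sketch. The deleter routes destruction through `common_sampler_free`, so early returns no longer leak the sampler:

```cpp
// Sketch: RAII ownership of a common_sampler via common_sampler_ptr.
common_sampler_ptr smpl(common_sampler_init(model, params.sampling));
if (!smpl) {
    return 1; // init failed; nothing to free
}
const llama_token id = common_sampler_sample(smpl.get(), ctx, /*idx =*/ -1);
common_sampler_accept(smpl.get(), id, /*accept_grammar =*/ true);
// smpl frees itself when it goes out of scope
```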


@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
for (int i = 0; i < params.n_draft; ++i) { for (int i = 0; i < params.n_draft; ++i) {
common_batch_clear(batch); common_batch_clear(batch);
common_sampler_sample(smpl, ctx_dft, 0, true); common_sampler_sample(smpl, ctx_dft, 0);
const auto * cur_p = common_sampler_get_candidates(smpl, true); const auto * cur_p = common_sampler_get_candidates(smpl, true);

File diff suppressed because it is too large


@ -143,6 +143,7 @@ models = [
{"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", }, {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", }, {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
{"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", }, {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
{"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
] ]
# some models are known to be broken upstream, so we will skip them as exceptions # some models are known to be broken upstream, so we will skip them as exceptions


@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family:
- Intel Built-in Arc GPU - Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md), although performance is not optimal, and some GPUs may not support OpenCL or may lack GPGPU capabilities entirely.
#### Verified devices #### Verified devices
| Intel GPU | Status | Verified Model | | Intel GPU | Status | Verified Model |


@ -9,7 +9,8 @@ Adding a model requires a few steps:
After following these steps, you can open PR. After following these steps, you can open PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/tools/main/) - [cli](/tools/cli/)
- [completion](/tools/completion/)
- [imatrix](/tools/imatrix/) - [imatrix](/tools/imatrix/)
- [quantize](/tools/quantize/) - [quantize](/tools/quantize/)
- [server](/tools/server/) - [server](/tools/server/)
@ -96,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`. 1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`: 2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map. - Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map. - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`. 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`. 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
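As a sketch of steps 1 and 2 for a hypothetical architecture (all names illustrative; check `src/llama-arch.cpp` for the exact shape of `llm_get_tensor_names` on the current tree):

```cpp
// src/llama-arch.h (sketch) -- step 1: add the enum value
enum llm_arch {
    // ... existing architectures ...
    LLM_ARCH_MYMODEL,   // hypothetical new architecture
    LLM_ARCH_UNKNOWN,
};

// src/llama-arch.cpp (sketch) -- step 2: register the name and tensors
// LLM_ARCH_NAMES gains:  { LLM_ARCH_MYMODEL, "mymodel" }
// llm_get_tensor_names gains the tensors the loader should expect for this
// architecture (token embeddings, output norm/head, per-layer attn/ffn),
// plus LLM_TENSOR_NAMES entries if the model introduces new tensor kinds.
```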


@ -7,9 +7,9 @@
## Images ## Images
We have three Docker images available for this project: We have three Docker images available for this project:
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) 1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) 2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) 3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
Additionally, there are the following images, similar to the above: Additionally, there are the following images, similar to the above:
@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
On completion, you are ready to play! On completion, you are ready to play!
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
``` ```
or with a light image: or with a light image:
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
``` ```
or with a server image: or with a server image:
@ -59,6 +61,8 @@ or with a server image:
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
``` ```
In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
## Docker With CUDA ## Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -80,9 +84,9 @@ The defaults are:
The resulting images are essentially the same as the non-CUDA images: The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. 2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. 3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.
## Usage ## Usage
@ -114,9 +118,9 @@ The defaults are:
The resulting images are essentially the same as the non-MUSA images: The resulting images are essentially the same as the non-MUSA images:
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-musa`: This image only includes the main executable file. 2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-musa`: This image only includes the server executable file. 3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.
## Usage ## Usage


@ -18,12 +18,12 @@ Legend:
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ❌ | ❌ | ❌ | | ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ❌ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | | ✅ | ❌ | ❌ | ❌ | | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ | | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | | 🟡 | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ | | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
@ -31,7 +31,7 @@ Legend:
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | 🟡 | ❌ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@ -64,7 +64,7 @@ Legend:
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | | LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | ✅ | ❌ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | | MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@ -98,14 +98,14 @@ Legend:
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | 🟡 | ❌ | ❌ | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | | SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | | SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | | 🟡 | ❌ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | | 🟡 | ❌ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | | SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -113,7 +113,7 @@ Legend:
| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | | SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ | | SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | 🟡 | ✅ | ❌ | ❌ | | SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | 🟡 | ✅ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |

File diff suppressed because it is too large


@ -2,6 +2,7 @@
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include "llama.h" #include "llama.h"
#include "sampling.h"
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
@ -64,11 +65,12 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_init_from_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false; sparams.no_perf = false;
std::vector<llama_sampler *> samplers;
for (int32_t i = 0; i < n_parallel; ++i) {
llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
@ -76,6 +78,11 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
samplers.push_back(smpl);
}
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1; return 1;
@ -173,7 +180,7 @@ int main(int argc, char ** argv) {
continue; continue;
} }
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) { if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
@ -229,14 +236,17 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG("\n"); LOG("\n");
llama_perf_sampler_print(smpl); llama_perf_sampler_print(samplers[0]);
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
llama_batch_free(batch); llama_batch_free(batch);
llama_sampler_free(smpl); for (auto & sampler_config : samplers) {
llama_sampler_free(sampler_config);
}
llama_free(ctx); llama_free(ctx);
llama_model_free(model); llama_model_free(model);


@ -131,10 +131,10 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
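This hunk shows the pattern that repeats through the rest of the diff: `common_init_from_params` now hands back a pointer-like handle with accessor methods instead of a struct whose smart-pointer members callers had to `.get()`. Side by side:

```cpp
// before: struct with smart-pointer members
common_init_result llama_init = common_init_from_params(params);
llama_model   * model = llama_init.model.get();
llama_context * ctx   = llama_init.context.get();

// after: handle with accessors; the handle keeps ownership, so the raw
// pointers stay valid only while llama_init is alive
auto llama_init = common_init_from_params(params);
auto * model = llama_init->model();
auto * ctx   = llama_init->context();
```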


@ -202,10 +202,10 @@ int main(int argc, char ** argv) {
params.warmup = false; params.warmup = false;
// init // init
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);


@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
static void write_table_entry(std::ofstream & file, const common_arg & opt) { static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `"; file << "| `";
// args // args
for (const auto & arg : opt.args) { auto all_args = opt.get_args();
if (arg == opt.args.front()) { for (const auto & arg : all_args) {
if (arg == all_args.front()) {
file << arg; file << arg;
if (opt.args.size() > 1) file << ", "; if (all_args.size() > 1) file << ", ";
} else { } else {
file << arg << (arg != opt.args.back() ? ", " : ""); file << arg << (arg != all_args.back() ? ", " : "");
} }
} }
// value hint // value hint
@ -47,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
} }
} }
static void export_md(std::string fname, llama_example ex) { static void export_md(std::string fname, llama_example ex, std::string name) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
common_params params; common_params params;
@ -71,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
write_table(file, common_options); write_table(file, common_options);
file << "\n\n**Sampling params**\n\n"; file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options); write_table(file, sparam_options);
file << "\n\n**Example-specific params**\n\n"; file << "\n\n**" << name << "-specific params**\n\n";
write_table(file, specific_options); write_table(file, specific_options);
} }
int main(int, char **) { int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION); // TODO: add CLI
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
return 0; return 0;
} }


@ -55,10 +55,10 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the target model // load the target model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
auto * mem = llama_get_memory(ctx); auto * mem = llama_get_memory(ctx);


@ -18,16 +18,16 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model_ptr & model = llama_init.model; auto * model = llama_init->model();
llama_context_ptr & ctx = llama_init.context; auto * ctx = llama_init->context();
GGML_ASSERT(model != nullptr); GGML_ASSERT(model != nullptr);
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = common_tokenize(ctx.get(), params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__); fprintf(stderr, "%s: tokenization done\n", __func__);
common_ngram_cache ngram_cache; common_ngram_cache ngram_cache;


@ -28,13 +28,13 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_context_ptr & ctx = llama_init.context; llama_context * ctx = llama_init->context();
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = common_tokenize(ctx.get(), params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
common_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic; common_ngram_cache ngram_cache_dynamic;
@ -65,7 +65,7 @@ int main(int argc, char ** argv){
} }
const int n_input = inp.size(); const int n_input = inp.size();
const int n_ctx = llama_n_ctx(ctx.get()); const int n_ctx = llama_n_ctx(ctx);
int n_drafted = 0; int n_drafted = 0;
int n_accept = 0; int n_accept = 0;


@ -29,10 +29,10 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
const llama_vocab * vocab = llama_model_get_vocab(model); const llama_vocab * vocab = llama_model_get_vocab(model);


@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the
model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example tries to model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example tries to
help with this process. help with this process.
> 📝 **Note:** When adding a new model from an existing family, verify the
> previous version passes logits verification first. Existing models can have
> subtle numerical differences that don't affect generation quality but cause
> logits mismatches. Identifying these upfront, whether they exist in llama.cpp,
> the conversion script, or an upstream implementation, can save significant
> debugging time.
### Overview ### Overview
The idea is that the makefile targets and scripts here can be used in the The idea is that the makefile targets and scripts here can be used in the
development/conversion process assisting with things like: development/conversion process assisting with things like:


@ -1,10 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import numpy as np
import sys import sys
import os import numpy as np
from pathlib import Path from pathlib import Path
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def quick_logits_check(pytorch_file, llamacpp_file): def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE""" """Lightweight sanity check before NMSE"""
@ -32,27 +35,16 @@ def quick_logits_check(pytorch_file, llamacpp_file):
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}") print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
print(f"Max absolute difference: {max_diff:.4f}") print(f"Max absolute difference: {max_diff:.4f}")
if max_diff > 1.0:
print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
return False
return True return True
def main(): def main():
model_path = os.getenv('MODEL_PATH') model_name = get_model_name_from_env_path('MODEL_PATH')
if not model_path:
print("Error: MODEL_PATH environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.basename(model_path)
data_dir = Path("data") data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin" pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
print(f"Using converted model: {llamacpp_model_name}")
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
if not pytorch_file.exists(): if not pytorch_file.exists():
print(f"Error: PyTorch logits file not found: {pytorch_file}") print(f"Error: PyTorch logits file not found: {pytorch_file}")


@ -7,7 +7,7 @@ base_model:
Recommended way to run this model: Recommended way to run this model:
```sh ```sh
llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa llama-server -hf {namespace}/{model_name}-GGUF -c 0
``` ```
Then, access http://localhost:8080 Then, access http://localhost:8080


@ -200,7 +200,7 @@ with torch.no_grad():
logits = outputs.logits logits = outputs.logits
# Extract logits for the last token (next token prediction) # Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].cpu().numpy() last_logits = logits[0, -1, :].float().cpu().numpy()
print(f"Logits shape: {logits.shape}") print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}") print(f"Last token logits shape: {last_logits.shape}")


@ -34,8 +34,11 @@ done
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}" MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
if [ -t 0 ]; then if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin" CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else else
# Process piped JSON data and convert to binary (matching logits.cpp format) # Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)


@ -5,6 +5,7 @@ import sys
import os import os
import argparse import argparse
from pathlib import Path from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def calculate_nmse(reference, test): def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2) mse = np.mean((test - reference) ** 2)
@ -67,11 +68,13 @@ def main():
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args() args = parser.parse_args()
model_name = os.path.basename(args.model_path) model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data") data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin" pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
print(f"Model name: {model_name}") print(f"Model name: {model_name}")
print(f"PyTorch logits file: {pytorch_file}") print(f"PyTorch logits file: {pytorch_file}")


@ -0,0 +1,20 @@
#!/usr/bin/env python3
import os
import sys
def get_model_name_from_env_path(env_path_name):
"""Return the basename of the path in the given env var, stripping a trailing .gguf suffix; exits if the variable is unset or the path does not exist."""
model_path = os.getenv(env_path_name)
if not model_path:
print(f"Error: {env_path_name} environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
name = os.path.basename(os.path.normpath(model_path))
if name.endswith(".gguf"):
name = name[:-5]
return name


@ -192,10 +192,10 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the target model // load the target model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
auto * mem = llama_get_memory(ctx); auto * mem = llama_get_memory(ctx);


@ -149,10 +149,10 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);


@ -34,10 +34,10 @@ int main(int argc, char ** argv) {
std::string result2; std::string result2;
// init // init
common_init_result llama_init = common_init_from_params(params); auto llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); auto * model = llama_init->model();
llama_context * ctx = llama_init.context.get(); auto * ctx = llama_init->context();
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); fprintf(stderr, "%s : failed to init\n", __func__);


@ -40,10 +40,10 @@ int main(int argc, char ** argv) {
llama_context * ctx_dft = NULL; llama_context * ctx_dft = NULL;
// load the target model // load the target model
common_init_result llama_init_tgt = common_init_from_params(params); auto llama_init_tgt = common_init_from_params(params);
model_tgt = llama_init_tgt.model.get(); model_tgt = llama_init_tgt->model();
ctx_tgt = llama_init_tgt.context.get(); ctx_tgt = llama_init_tgt->context();
const llama_vocab * vocab = llama_model_get_vocab(model_tgt); const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
@ -61,10 +61,10 @@ int main(int argc, char ** argv) {
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
common_init_result llama_init_dft = common_init_from_params(params); auto llama_init_dft = common_init_from_params(params);
//model_dft = llama_init_dft.model.get(); //model_dft = llama_init_dft->model();
ctx_dft = llama_init_dft.context.get(); ctx_dft = llama_init_dft->context();
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
LOG_INF("target:\n\n"); LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl); common_perf_print(ctx_tgt, smpl);
llama_batch_free(batch_tgt);
common_sampler_free(smpl); common_sampler_free(smpl);
common_speculative_free(spec); common_speculative_free(spec);


@ -71,10 +71,10 @@ int main(int argc, char ** argv) {
llama_context * ctx_dft = NULL; llama_context * ctx_dft = NULL;
// load the target model // load the target model
common_init_result llama_init_tgt = common_init_from_params(params); auto llama_init_tgt = common_init_from_params(params);
model_tgt = llama_init_tgt.model.get(); model_tgt = llama_init_tgt->model();
ctx_tgt = llama_init_tgt.context.get(); ctx_tgt = llama_init_tgt->context();
// load the draft model // load the draft model
params.devices = params.speculative.devices; params.devices = params.speculative.devices;
@ -87,10 +87,10 @@ int main(int argc, char ** argv) {
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
common_init_result llama_init_dft = common_init_from_params(params); auto llama_init_dft = common_init_from_params(params);
model_dft = llama_init_dft.model.get(); model_dft = llama_init_dft->model();
ctx_dft = llama_init_dft.context.get(); ctx_dft = llama_init_dft->context();
const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
bool accept = false; bool accept = false;
if (params.sampling.temp > 0) { if (params.sampling.temp > 0) {
// stochastic verification // stochastic verification
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
auto & dist_tgt = *common_sampler_get_candidates(smpl, true); auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
@ -491,7 +491,7 @@ int main(int argc, char ** argv) {
continue; continue;
} }
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true); const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);


@@ -39,9 +39,10 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
     // load the model and apply lora adapter, if any
-    common_init_result llama_init = common_init_from_params(params);
-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
+    auto llama_init = common_init_from_params(params);
+
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -54,8 +55,8 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
-    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
-    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx, tokens, llama_n_ctx(ctx) / 2);
     struct lr_opt & lr = params.lr;
     LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
@@ -70,7 +71,7 @@ int main(int argc, char ** argv) {
         /*get_opt_pars_ud =*/&params.lr,
         /*optimizer_type =*/params.optimizer,
     };
-    llama_opt_init(ctx.get(), model.get(), lopt_params);
+    llama_opt_init(ctx, model, lopt_params);
     const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
@@ -78,7 +79,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_t result_eval = ggml_opt_result_init();
     for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
-        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+        llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
             ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
         fprintf(stderr, "\n");
@@ -88,7 +89,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_free(result_train);
     ggml_opt_result_free(result_eval);
-    llama_model_save_to_file(model.get(), params.out_file.c_str());
+    llama_model_save_to_file(model, params.out_file.c_str());
     llama_backend_free();

View File

@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()

 if (EMSCRIPTEN)

View File

@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
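Note: a minimal usage sketch of the two new size-query entry points, measuring before committing to an allocation. Not part of the patch; `ctx`, `graph`, and `budget` are placeholder names supplied by the caller.

    // measure first, allocate only if it fits the budget
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // compute-buffer size for a worst-case graph, without allocating:
    ggml_gallocr_t galloc = ggml_gallocr_new(buft);   // single-buffer galloc -> one size
    size_t compute_size = 0;
    ggml_gallocr_reserve_n_size(galloc, graph, NULL, NULL, &compute_size);

    // tensor storage for a context, again without allocating:
    size_t weights_size = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);

    if (compute_size + weights_size <= budget) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        // ... proceed with ggml_gallocr_reserve_n()/graph allocation as usual
    }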

View File

@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
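A sketch of the intended call pattern (an assumption based on the gallocr counterpart, which writes one size per buffer; `sched` and `measure_graph` are the caller's own, and <vector>/<cstdio> are assumed to be included):

    std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0);
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
        fprintf(stderr, "backend %d: %zu bytes of compute buffer\n", i, sizes[i]);
    }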

View File

@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme       (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen  (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
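The new query complements the boolean RVV flag; later in this patch it gates Q4_0 repacking on the runtime vector length, along these lines (sketch only):

    // select a 256-bit code path only when the hardware VLEN allows it
    if (ggml_cpu_has_riscv_v() && ggml_cpu_get_rvv_vlen() >= 32) {   // 32 bytes = 256-bit VLEN
        // ... use the 8x8 repacked Q4_0 path
    }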

View File

@@ -2615,6 +2615,7 @@ extern "C" {
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
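`ggml_log_get` enables save-and-restore patterns around noisy sections; a small sketch (`quiet_logger` is a placeholder callback, not part of the patch):

    ggml_log_callback prev_cb = NULL;
    void * prev_ud = NULL;
    ggml_log_get(&prev_cb, &prev_ud);   // save the current sink

    ggml_log_set(quiet_logger, NULL);   // e.g. a no-op callback
    // ... run a noisy section ...
    ggml_log_set(prev_cb, prev_ud);     // restore the previous sink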

View File

@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }

 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }

 // free the extra space at the end if the new tensor is smaller
@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -928,12 +931,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
             if (cur_size > 0) {
                 GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                    __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                    cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                    __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
             }
         }
 #endif

         ggml_vbuffer_free(galloc->buffers[i]);
+        if (no_alloc) {
+            galloc->buffers[i] = NULL;
+        } else {
             galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
@@ -941,10 +946,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    }

     return true;
 }

+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1147,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

     size_t alignment = ggml_backend_buft_get_alignment(buft);
@@ -1155,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;

+    *nbytes_total = 0;
     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;
@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }

+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }
@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     } else {
         buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
     return buffer;
 }

+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }

View File

@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }

-    GGML_ASSERT(buft);
     return buft->iface.alloc_buffer(buft, size);
 }
@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }

+    // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+    //   I don't know whether the above comment is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);

     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

View File

@@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
+            return true;
         case GGML_OP_PAD:
             // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
             return ggml_get_op_params_i32(op, 8) == 0;

View File

@@ -24,6 +24,7 @@
 #define UNUSED GGML_UNUSED

+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t * out_mins,
                                              int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif

 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);

View File

@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif

 #if defined(_WIN32)
@@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
@@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
         ggml_init_arm_arch_features();
 #endif

+#if defined(__riscv)
+        ggml_init_riscv_arch_features();
+#endif
+
         is_first_call = false;
     }

View File

@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }

View File

@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }

View File

@@ -67,13 +67,16 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

 #define GGML_CUDA_CC_IS_AMD(cc)     (cc >= GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_RDNA(cc)    (cc >= GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_RDNA1(cc)   (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
 #define GGML_CUDA_CC_IS_RDNA2(cc)   (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc)   (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
+#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
 #define GGML_CUDA_CC_IS_RDNA4(cc)   (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)     (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
 #define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
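The split keeps the umbrella macro true for both sub-generations, so existing RDNA3 checks keep matching. Illustratively, for a gfx1150 (RDNA3.5) part:

    const int cc = GGML_CUDA_CC_OFFSET_AMD + 0x1150;
    // GGML_CUDA_CC_IS_RDNA3_0(cc) -> false  (0x1150 is not < 0x1150)
    // GGML_CUDA_CC_IS_RDNA3_5(cc) -> true
    // GGML_CUDA_CC_IS_RDNA3(cc)   -> true   (umbrella: RDNA3_0 or RDNA3_5)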

View File

@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
     const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
     const int iter_j = (ne01 + (ncols1    - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     const bool did_not_have_any_data   = kbc0 == kbc0_stop;
     const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
     int bidx = bidx0 - 1;
     int kbc_stop = kbc0;
     while(true) {
-        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
         if (kbc == kbc_stop) { // Did not have any data.
             bidx--;
             kbc_stop = kbc;

View File

@@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
     const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;

     // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

     // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
     // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
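Why the `int64_t` cast matters: the combined iteration count `iter_k*iter_j*(ne02/ncols2)*ne03` can reach a few million for long contexts, so the 32-bit product with the block index overflows even though the final quotient fits in an `int`. Illustrative numbers (not taken from a real model):

    const int bidx0    = 3000;
    const int per_grid = 1 << 21;                       // combined iteration count
    int     bad  = bidx0 * per_grid / 3001;             // 3000 * 2^21 ~ 6.3e9 > INT_MAX: signed overflow (UB)
    int64_t good = int64_t(bidx0) * per_grid / 3001;    // ~ 2.1e6, safely assignable back to int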

View File

@@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
                 return 8 * (threadIdx.x / 16) + l;
 #elif defined(RDNA3)
                 return 2 * l + (threadIdx.x / 16);
+#else
+                NO_DEVICE_CODE;
+                return -1;
 #endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
@@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
             }
         }
 #elif defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
         static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
         half2 x[ne] = {{0.0f, 0.0f}};

         static constexpr __device__ bool supported() {
@@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
         static constexpr int I = I_;
         static constexpr int J = J_;
         static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
-        static constexpr int ne = I * J / WARP_SIZE;
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

 #if defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
+        static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
         static constexpr __device__ bool supported() {
             if (I == 16 && J == 8) return true;
             return false;
@@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
             }
         }
 #else
+        static constexpr int ne = I * J / WARP_SIZE;
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
         static constexpr __device__ bool supported() {
             if (I == 8 && J == 8) return true;
             if (I == 16 && J == 4) return true;
@@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
         }
 #elif defined(AMD_WMMA_AVAILABLE)
         if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
+#if defined(RDNA4)
             ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+#elif defined(RDNA3)
+            ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x,          xs0 + t.get_i(0) * stride + t.get_j(0));
+            ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
+#else
+            NO_DEVICE_CODE;
+#endif // defined(RDNA4)
         } else if constexpr (std::is_same_v<T, int>) {
             if constexpr (I == 16 && J == 4) {
                 int64_t * xi = (int64_t *) t.x;
@@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
             const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
             const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
             acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+            using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
+            using floatx8_t = __attribute__((ext_vector_type(8))) float;
+            floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+            const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
+            const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
+            acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
+#else
+            GGML_UNUSED_VARS(D, A, B);
+            NO_DEVICE_CODE;
 #endif // RDNA4
 #else
             GGML_UNUSED_VARS(D, A, B);
@@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
             const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
             const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
             acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+            using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
+            using floatx8_t = __attribute__((ext_vector_type(8))) float;
+            floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+            const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
+            const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
+            acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
+#else
+            GGML_UNUSED_VARS(D, A, B);
+            NO_DEVICE_CODE;
 #endif // RDNA4
 #else
             GGML_UNUSED_VARS(D, A, B);

View File

@@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16) {
+        if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
+            return false;
+        } else if (src1_ncols > 16) {
             return false;
         }
     }
@@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
         case GGML_TYPE_F32:
             return ampere_mma_available(cc);
         case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
         case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return ampere_mma_available(cc) || amd_wmma_available(cc);
         default:
             return false;
     }

View File

@@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
         return ne11 <= 8;
     } else if (GGML_CUDA_CC_IS_AMD(cc)) {
         if (fp16_mma_hardware_available(cc)) {
-            if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+            if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+                return ne11 <= 3;
+            }
+            if (GGML_CUDA_CC_IS_RDNA4(cc)) {
                 return ne11 <= 5;
             }
             return ne11 <= 2;

View File

@@ -1974,9 +1974,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
             break;

         case GGML_TYPE_F16:
-            if (!opt_experimental) {
-                return false;
-            }
             break;

         default:

View File

@@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         const float * restrict vy = (const float * restrict) y;

         for (uint32_t i = 0; i < n; i++) {
-            rsum += vx[i] * (__fp16) vy[i];
+            rsum += (float) vx[i] * vy[i];
         }
         *s = rsum;
         return;
@@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     // for some reason we need volatile here so that the compiler doesn't try anything funky
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
+    float r_sum_scalar = 0.0f;

     uint32_t i = 0;
     for (i = 0; i < nv0; i++) {
@@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         HVX_Vector x = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
-        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        // NOTE: need volatile here to prevent compiler optimization;
+        //   it seems the compiler cannot guarantee read-after-write??
+        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
     }

     if (nv1) {
-        HVX_VectorPair yp = vy[i];
-        HVX_Vector x = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
-        if (nv1 >= 32) {
-            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-            nv1 -= 32;
-        }
+        // HVX_VectorPair yp = vy[i];
+        // HVX_Vector x = vx[i];
+        // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
+        // if (nv1 >= 32) {
+        //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+        //     nv1 -= 32;
+        // }
+        // rsum = hvx_vec_qf32_reduce_sum(rsum);
+        // if (nv1) {
+        //     volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+        // }

+        // process the remainder using a scalar loop
         rsum = hvx_vec_qf32_reduce_sum(rsum);
-        if (nv1) {
-            HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+
+        const __fp16 * restrict sx = (const __fp16 * restrict) x;
+        const float * restrict sy = (const float * restrict) y;
+        for (uint32_t i = nv0 * 64; i < n; i++) {
+            r_sum_scalar += (float) sx[i] * sy[i];
         }

         // hvx_vec_dump_fp16("X", x);
@@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         rsum = hvx_vec_qf32_reduce_sum(rsum);
     }

-    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
+    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;

 # ifdef HTP_DEBUG
     {
@@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     uint64_t t1, t2;
     t1 = HAP_perf_get_qtimer_count();

-    const size_t src0_row_size = sizeof(__fp16) * ne00;
-    const size_t src1_row_size = sizeof(float) * ne10;

     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
@@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     // This is the size of the rest of the dimensions of the result
     const uint32_t nr1 = ne1 * ne2 * ne3;

-    uint32_t chunk_size = 64;

     // distribute the thread work across the inner or outer loop based on which one is larger
     uint32_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
     uint32_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
@@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     const uint32_t blck_0 = 64;
     const uint32_t blck_1 = 64;

-    float tmp[32];
+    __attribute__((aligned(128))) float tmp[64];

     for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
-            for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
+            for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
                 const uint32_t i13 = (ir1 / (ne12 * ne1));
                 const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
                 const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
@@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
                 const uint32_t i2 = i12;
                 const uint32_t i3 = i13;

-                const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
+                const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
                 const uint8_t * restrict src1_col =
-                    (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
+                    (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
                 float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));

-                for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
-                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
+                const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+                for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
+                    // Use nb01 stride for non-contiguous src0 support
+                    const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
+                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
                 }

                 hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);

View File

@@ -769,9 +769,16 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 #endif

     dev->props.use_shared_buffers = dev->props.has_unified_memory;

+#if TARGET_OS_OSX
+    // In case of eGPU, shared memory may be preferable.
+    dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
+#endif
+
     if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
         dev->props.use_shared_buffers = false;
     }

+    if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
+        dev->props.use_shared_buffers = true;
+    }
+
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];

View File

@@ -0,0 +1,77 @@
+#include <sycl/sycl.hpp>
+
+#include "common.hpp"
+#include "add-id.hpp"
+
+static void add_id_kernel(
+    const float * src0,
+    const float * src1,
+    const int32_t * src2,
+    float * dst,
+    int64_t ne0,
+    int64_t ne1,
+    size_t nb01,
+    size_t nb02,
+    size_t nb11,
+    size_t nb21,
+    sycl::nd_item<3> item_ct1) {
+
+    const int64_t i1 = item_ct1.get_group(2);
+    const int64_t i2 = item_ct1.get_group(1);
+
+    const int i11 =
+        *(const int32_t *) ((const char *) src2 + i1 * sizeof(int32_t) + i2 * nb21);
+
+    const size_t nb1 = ne0 * sizeof(float);
+    const size_t nb2 = ne1 * nb1;
+
+    float * dst_row = (float *) ((char *) dst + i1 * nb1 + i2 * nb2);
+    const float * src0_row =
+        (const float *) ((const char *) src0 + i1 * nb01 + i2 * nb02);
+    const float * src1_row = (const float *) ((const char *) src1 + i11 * nb11);
+
+    for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0;
+         i0 += item_ct1.get_local_range(2)) {
+        dst_row[i0] = src0_row[i0] + src1_row[i0];
+    }
+}
+
+void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_TENSOR_TERNARY_OP_LOCALS
+
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb20 == sizeof(int32_t));
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    const int32_t * src2_d = (const int32_t *) src2->data;
+    float * dst_d = (float *) dst->data;
+
+    int threads = std::min((int) ne00, 768); // cols
+
+    ctx.stream()->parallel_for(
+        sycl::nd_range<3>(
+            sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
+            sycl::range<3>(1, 1, threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            add_id_kernel(src0_d, src1_d, src2_d, dst_d,
+                          ne0, ne1, nb01, nb02, nb11, nb21, item_ct1);
+        });
+}
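For readers unfamiliar with the op: GGML_OP_ADD_ID adds a src1 row, chosen per output row by the int32 index tensor src2, to each row of src0 (used for per-expert biases in MoE layers). A scalar reference for the fully contiguous f32 case — a sketch simplified from the kernel above, assuming src2 is laid out as [ne01 x ne02] row-major:

    // dst[i2][i1][:] = src0[i2][i1][:] + src1[ src2[i2][i1] ][:]
    for (int64_t i2 = 0; i2 < ne02; ++i2) {
        for (int64_t i1 = 0; i1 < ne01; ++i1) {
            const int32_t i11 = src2[i2 * ne01 + i1];   // selected src1 row
            for (int64_t i0 = 0; i0 < ne00; ++i0) {
                dst[(i2 * ne01 + i1) * ne00 + i0] =
                    src0[(i2 * ne01 + i1) * ne00 + i0] + src1[i11 * ne00 + i0];
            }
        }
    }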

View File

@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ADD_ID_HPP
+#define GGML_SYCL_ADD_ID_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ADD_ID_HPP

View File

@@ -642,5 +642,22 @@ static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3
     return sycl::uint2(div_val, mod_val);
 }

+static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
+    return dpct::dp4a(a, b, c);
+}
+
+static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
+    uint32_t bits;
+    if (x == 0) {
+        bits = 0x00400000;
+    } else {
+        bits = (uint32_t) x << 23;
+    }
+
+    float result;
+    memcpy(&result, &bits, sizeof(float));
+    return result;
+}
+
 #endif // GGML_SYCL_COMMON_HPP
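e8m0 is an exponent-only format: a stored byte x decodes to 2^(x - 127). Shifting x into the float exponent field implements that directly; x == 0 is special-cased because the natural result 2^-127 is subnormal, hence the fixed bit pattern 0x00400000 (= 2^-127). Spot checks that follow from the bit layout:

    assert(ggml_sycl_e8m0_to_fp32(127) == 1.0f);      // 2^(127-127) = 2^0
    assert(ggml_sycl_e8m0_to_fp32(128) == 2.0f);      // 2^1
    assert(ggml_sycl_e8m0_to_fp32(126) == 0.5f);      // 2^-1
    assert(ggml_sycl_e8m0_to_fp32(0) == 0x1p-127f);   // subnormal special case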

View File

@@ -472,6 +472,16 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
     }
 }

+template <typename dst_t>
+static void dequantize_row_mxfp4_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
+        [=](sycl::nd_item<3> item_ct1) {
+            dequantize_block_mxfp4(vx, y, item_ct1);
+        });
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
                              const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
@@ -518,6 +528,7 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
     convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
 }
+
 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -571,6 +582,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
             return dequantize_row_iq4_xs_sycl;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_sycl;
         case GGML_TYPE_F32:
             return convert_unary_sycl<float>;
 #ifdef GGML_SYCL_HAS_BF16
@@ -636,6 +649,8 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
             return dequantize_row_iq4_xs_sycl;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_MXFP4:
+            return dequantize_row_mxfp4_sycl;
         case GGML_TYPE_F16:
             return convert_unary_sycl<sycl::half>;
 #ifdef GGML_SYCL_HAS_BF16

View File

@@ -819,5 +819,23 @@ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
     }
 }

+template<typename dst_t>
+static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                   const sycl::nd_item<3> & item_ct1) {
+    // auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+    const int64_t i = item_ct1.get_group(2);
+    const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);
+
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid/8; // 0...3
+    const int64_t ib  = tid%8; // 0...7
+
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = ggml_sycl_e8m0_to_fp32(x[ib].e);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
+        y[j+16] = d * kvalues_mxfp4[q4[j] >>  4]*0.5f;
+    }
+}
+
 #endif // GGML_SYCL_DEQUANTIZE_HPP

View File

@@ -1860,10 +1860,31 @@ namespace dpct
                        : id);
     }

+    template <typename T1, typename T2>
+    using dot_product_acc_t = std::conditional_t<
+        std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+        uint32_t,
+        int32_t>;
+
+    template <typename T>
+    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
+    {
+        return sycl::vec<T, 1>(val)
+            .template as<sycl::vec<
+                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>,
+                4>>()
+            .template convert<T>();
+    }
+
     template <typename T1, typename T2, typename T3>
-    inline auto dp4a(T1 a, T2 b, T3 c)
-    {
-        return syclcompat::dp4a(a, b, c);
+    inline auto dp4a(T1 a, T2 b, T3 c) {
+        dot_product_acc_t<T1, T2> res = c;
+        auto va = extract_and_sign_or_zero_extend4(a);
+        auto vb = extract_and_sign_or_zero_extend4(b);
+        res += va[0] * vb[0];
+        res += va[1] * vb[1];
+        res += va[2] * vb[2];
+        res += va[3] * vb[3];
+        return res;
     }

     struct sub_sat
@@ -2972,6 +2993,38 @@ namespace dpct
         atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
     }

+    inline unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s)
+    {
+        unsigned int ret;
+        ret = ((((std::uint64_t) b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
+              (((((std::uint64_t) b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
+              (((((std::uint64_t) b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
+              (((((std::uint64_t) b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
+        return ret;
+    }
+
+    inline uint32_t byte_level_permute_custom(uint32_t low32, uint32_t high32, uint32_t sel, int mode = 0)
+    {
+        constexpr uint16_t lookup[6][4] = {
+            {0x3210, 0x4321, 0x5432, 0x6543},  // Forward 4-byte extract
+            {0x5670, 0x6701, 0x7012, 0x0123},  // Backward 4-byte extract
+            {0x0000, 0x1111, 0x2222, 0x3333},  // Replicate 8-bit values
+            {0x3210, 0x3211, 0x3222, 0x3333},  // Edge clamp left
+            {0x0000, 0x1110, 0x2210, 0x3210},  // Edge clamp right
+            {0x1010, 0x3232, 0x1010, 0x3232}   // Replicate 16-bit values
+        };
+        if (mode >= 1 && mode <= 6) {
+            return byte_level_permute(low32, high32, lookup[mode - 1][sel & 0x3]);
+        } else if (!mode) {
+            return byte_level_permute(low32, high32, sel);
+        }
+        return 0;
+    }
+
 } // COPY from DPCT head files

 #endif // GGML_SYCL_DPCT_HELPER_HPP
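The reimplemented dp4a treats each 32-bit operand as four packed 8-bit lanes (signed or unsigned per the deduced element type), multiplies lane-wise, and adds the products to c. A worked example, assuming plain int operands so the signed path is taken:

    // 0x01020304 packs the int8 lanes {4, 3, 2, 1} in little-endian byte order;
    // lane order does not affect the sum.
    int r0 = dpct::dp4a(0x01020304, 0x01010101, 0);  // 4 + 3 + 2 + 1          = 10
    int r1 = dpct::dp4a(0x01020304, 0x01020304, 5);  // 5 + (16 + 9 + 4 + 1)   = 35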

View File

@@ -911,6 +911,98 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
         });
 }

+__dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
+    x = sycl::fmin(x, limit);
+    g = sycl::fmax(sycl::fmin(g, limit), -limit);
+
+    float out_glu = x / (1.0f + sycl::native::exp(-x * alpha));
+    out_glu = out_glu * (1.0f + g);
+    return out_glu;
+}
+
+template <typename T>
+static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k,
+                              const int64_t n, const int64_t o0, const int64_t o1,
+                              float alpha, float limit, sycl::nd_item<3> item_ct1) {
+    const int64_t i = int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t j0 = (i / n) * o0 + (i % n);
+    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+
+    float xi = x[j0];
+    float gi = g[j1];
+
+    dst[i] = ggml_sycl_op_swiglu_oai_single(xi, gi, alpha, limit);
+}
+
+template <typename T>
+static void swiglu_oai_sycl(const T * x,
+                            const T * g,
+                            T * dst,
+                            const int64_t k,
+                            const int64_t n,
+                            const int64_t o0,
+                            const int64_t o1,
+                            const float alpha,
+                            const float limit,
+                            dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE),
+                                           sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1);
+                         });
+}
+
+void ggml_sycl_op_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    void * src0_d = src0->data;
+    void * src1_d = src1 ? src1->data : src0->data;
+    const int64_t src0_o = src0->nb[1];
+    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+    void * dst_d = dst->data;
+    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+        GGML_ASSERT(src1->ne[0] == nc);
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    //const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    const float alpha = ggml_get_op_params_f32(dst, 2);
+    const float limit = ggml_get_op_params_f32(dst, 3);
+
+    float * src0_p = (float *) src0_d;
+    float * src1_p = (float *) src1_d;
+
+    if (!src1) {
+        src0_p += swapped ? nc : 0;
+        src1_p += swapped ? 0 : nc;
+    }
+
+    swiglu_oai_sycl(src0_p, src1_p, (float *) dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
+}
+
 static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
@@ -1070,6 +1162,11 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_swiglu(ctx, dst);
 }

+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_swiglu_oai(ctx, dst);
+}
+
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_geglu_erf(ctx, dst);
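When src1 is absent, the op reads both halves from src0: `nc` is half the row length and `swapped` selects which half is the value and which is the gate. A host-side scalar rendering of the same math, mirroring ggml_sycl_op_swiglu_oai_single (a self-contained sketch, not part of the patch):

    #include <algorithm>
    #include <cmath>

    // clamp x from above and g to [-limit, limit], then sigmoid-gate the product
    static float swiglu_oai_ref(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
        x = std::min(x, limit);
        g = std::clamp(g, -limit, limit);
        const float out_glu = x / (1.0f + std::exp(-x * alpha));
        return out_glu * (1.0f + g);
    }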

View File

@@ -5,6 +5,8 @@
 #include "ggml.h"
 #include <limits> // For std::numeric_limits

+#define SYCL_GLU_BLOCK_SIZE 256
+
 template <typename T>
 T neg_infinity() {
     return -std::numeric_limits<T>::infinity();
@@ -41,6 +43,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

View File

@@ -39,6 +39,7 @@
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
+#include "ggml-sycl/add-id.hpp"
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
 #include "ggml-sycl/element_wise.hpp"
@@ -3313,6 +3314,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     // mmvq and mmq need the __dp4a instruction which is available for gen12+
     // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
     use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
@@ -3320,7 +3322,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
     // mmvq path is faster in the CUDA backend.
     if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
     // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
@@ -3711,6 +3712,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_ADD1: // TODO: more efficient implementation
             ggml_sycl_add(ctx, dst);
             break;
+        case GGML_OP_ADD_ID:
+            ggml_sycl_add_id(ctx, dst);
+            break;
         case GGML_OP_SUB:
             ggml_sycl_sub(ctx, dst);
             break;
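
Note: GGML_OP_ADD_ID adds one row of src1, selected per row by an id tensor, to each row of src0; it is used for per-expert biases in MoE layers. A scalar sketch of the semantics on contiguous data (an assumption for illustration; the real kernel honours tensor strides):

    #include <cstdint>

    // dst[i0, i1, i2] = src0[i0, i1, i2] + src1[i0, ids[i1, i2]]
    static void add_id_ref(float * dst, const float * src0, const float * src1,
                           const int32_t * ids, int64_t ne0, int64_t ne1, int64_t ne2) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {          // tokens
            for (int64_t i1 = 0; i1 < ne1; ++i1) {      // experts used per token
                const int32_t e = ids[i2*ne1 + i1];     // which bias row to add
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    const int64_t i = (i2*ne1 + i1)*ne0 + i0;
                    dst[i] = src0[i] + src1[e*ne0 + i0];
                }
            }
        }
    }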
@@ -3803,6 +3807,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
             case GGML_GLU_OP_SWIGLU:
                 ggml_sycl_swiglu(ctx, dst);
                 break;
+            case GGML_GLU_OP_SWIGLU_OAI:
+                ggml_sycl_swiglu_oai(ctx, dst);
+                break;
             case GGML_GLU_OP_GEGLU_ERF:
                 ggml_sycl_geglu_erf(ctx, dst);
                 break;
@@ -4397,6 +4404,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_GLU_OP_REGLU:
             case GGML_GLU_OP_GEGLU:
             case GGML_GLU_OP_SWIGLU:
+            case GGML_GLU_OP_SWIGLU_OAI:
             case GGML_GLU_OP_GEGLU_ERF:
             case GGML_GLU_OP_GEGLU_QUICK:
                 return ggml_is_contiguous_1(op->src[0]);
@@ -4424,15 +4432,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 }
             }
             ggml_type src0_type = op->src[0]->type;
-            if (src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_MXFP4) {
-                // TODO: support MXFP4
+            if (src0_type == GGML_TYPE_BF16) {
+                // TODO: support GGML_TYPE_BF16
                 // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
                 return false;
             }
             // TODO: The configuration below needs more work to be supported with oneDNN
-            if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
+            if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
+                a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
                 return false;
             }
             // TODO: This specific configuration can fail with oneDNN and needs more debugging
             if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
                 a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
@@ -4553,9 +4564,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            return true;
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
+        case GGML_OP_ADD_ID:
         case GGML_OP_SUB:
         case GGML_OP_COUNT_EQUAL:
         case GGML_OP_MUL:
View File
@@ -595,6 +595,25 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
+static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
+                                        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_MXFP4 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
+        });
+    }
+}
+
 static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
@@ -1123,6 +1142,9 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
         case GGML_TYPE_IQ4_XS:
             mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_MXFP4:
+            mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
         default:
             GGML_ABORT("fatal error");
     }
View File
@@ -16,8 +16,8 @@
 static void pad_f32(const float * src, float * dst,
                     const int lp0, const int rp0, const int lp1, const int rp1,
                     const int lp2, const int rp2, const int lp3, const int rp3,
-                    const int ne0, const int ne1, const int ne2, const int ne3) {
-    auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+                    const int ne0, const int ne1, const int ne2, const int ne3,
+                    sycl::nd_item<3> item_ct1) {
     int i0 = item_ct1.get_local_id(2) +
              item_ct1.get_group(2) * item_ct1.get_local_range(2);
     int i1 = item_ct1.get_group(1);
@@ -63,7 +63,7 @@ static void pad_f32_sycl(const float *src, float *dst, const int lp0,
                              sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
-                        ne2, ne3);
+                        ne2, ne3, item_ct1);
             });
 }
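
Note: the change above stops querying the work item through the sycl::ext::oneapi::this_work_item extension and instead threads the nd_item through as a kernel parameter, the portable SYCL pattern. A minimal sketch of the resulting style (illustrative kernel, not from this patch; n is assumed to be a multiple of the work-group size):

    #include <sycl/sycl.hpp>

    // Portable style: the nd_item arrives as an argument from parallel_for
    // rather than from a free-function query inside the kernel body.
    static void scale_f32(float * x, float s, const sycl::nd_item<3> & it) {
        x[it.get_global_id(2)] *= s;
    }

    static void scale_f32_sycl(float * x, float s, size_t n, sycl::queue & q) {
        q.parallel_for(sycl::nd_range<3>({1, 1, n}, {1, 1, 64}),
                       [=](sycl::nd_item<3> it) { scale_f32(x, s, it); });
    }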
View File
@@ -88,7 +88,7 @@ void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(src0->nb[0] == sizeof(float));
     GGML_ASSERT(src1->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[1] == src0->ne[0] * static_cast<int>(sizeof(float)));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
 
     const int src_stride_inner = ncs;
     const int src_stride_seq = ncs * d_inner;
View File
@@ -20,6 +20,18 @@
 typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                   const int & iqs);
 
+static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
+    const uint8_t * x8 = (const uint8_t *) x;
+
+    int x32 = x8[4*i32 + 0] << 0;
+    x32    |= x8[4*i32 + 1] << 8;
+    x32    |= x8[4*i32 + 2] << 16;
+    x32    |= x8[4*i32 + 3] << 24;
+
+    return x32;
+}
+
 static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
     const uint16_t* x16 =
         (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
@@ -75,6 +87,28 @@ static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
     val2 = v1 | (v2 << 16);
 }
 
+static __dpct_inline__ sycl::int2 get_int_from_table_16(
+        const int& q4, const int8_t* table) {
+    const uint32_t* table32 = (const uint32_t*)table;
+    uint32_t tmp[2];
+    const uint32_t low_high_selection_indices =
+        (0x32103210 | ((q4 & 0x88888888) >> 1));
+#pragma unroll
+    for (uint32_t i = 0; i < 2; ++i) {
+        const uint32_t shift = 16 * i;
+        const uint32_t low =
+            dpct::byte_level_permute(table32[0], table32[1], q4 >> shift);
+        const uint32_t high =
+            dpct::byte_level_permute(table32[2], table32[3], q4 >> shift);
+        tmp[i] = dpct::byte_level_permute(
+            low, high, low_high_selection_indices >> shift);
+    }
+    return sycl::int2(
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x6420),
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x7531));
+}
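
Note: this overload performs eight 4-bit table lookups at once with byte-level permutes. A scalar equivalent, assuming the same semantics as the analogous CUDA helper (the low nibble of each byte feeds the first result, the high nibble the second):

    #include <cstdint>
    #include <cstring>

    // Scalar equivalent of the vectorized lookup above (illustrative only).
    static void get_int_from_table_16_ref(uint32_t q4, const int8_t table[16],
                                          int32_t & lo, int32_t & hi) {
        uint8_t lo_b[4], hi_b[4];
        for (int b = 0; b < 4; ++b) {
            lo_b[b] = (uint8_t) table[(q4 >> (8*b))     & 0x0F];
            hi_b[b] = (uint8_t) table[(q4 >> (8*b + 4)) & 0x0F];
        }
        std::memcpy(&lo, lo_b, 4);  // four looked-up bytes packed per int
        std::memcpy(&hi, hi_b, 4);
    }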
 #define VDR_Q2_K_Q8_1_MMVQ 1
 
 // contiguous v/x values
@@ -685,6 +719,30 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }
 
+#define VDR_MXFP4_Q8_1_MMVQ 2
+#define VDR_MXFP4_Q8_1_MMQ  4
+
+static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
+                                                const block_q8_1 * __restrict__ bq8_1,
+                                                const int & iqs) {
+    const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
+        const sycl::int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
+
+        sumi = ggml_sycl_dp4a(v.x(), q8[l + 0], sumi);
+        sumi = ggml_sycl_dp4a(v.y(), q8[l + 4], sumi);
+    }
+
+    const float d = ggml_sycl_e8m0_to_fp32(bq4->e) * 0.5f * (bq8_1->ds)[0];
+    return d * sumi;
+}
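
Note: a block_mxfp4 packs 32 weights as 4-bit indices into kvalues_mxfp4 plus one shared E8M0 exponent, and the table stores the E2M1 values doubled, which is why the final scale multiplies by 0.5f. A scalar dequantization sketch, assuming the layout used by the other backends:

    #include <cmath>
    #include <cstdint>

    // Dequantize one MXFP4 block (illustrative; the dot product above never
    // materializes these floats). qs[16] holds 32 packed nibbles.
    static void dequant_mxfp4_ref(const uint8_t qs[16], uint8_t e, float out[32]) {
        // 2x the E2M1 magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}, both signs
        static const int8_t kvalues[16] = { 0,  1,  2,  3,  4,  6,  8,  12,
                                            0, -1, -2, -3, -4, -6, -8, -12 };
        const float d = std::ldexp(1.0f, (int) e - 127) * 0.5f; // E8M0, bias 127
        for (int i = 0; i < 16; ++i) {
            out[i]      = d * kvalues[qs[i] & 0x0F]; // low nibbles: elems 0..15
            out[i + 16] = d * kvalues[qs[i] >> 4];   // high nibbles: elems 16..31
        }
    }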
 static __dpct_inline__ float
 vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
View File
@@ -659,6 +659,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_cos_f32;
     vk_pipeline pipeline_log[2];
     vk_pipeline pipeline_tri[2];
+    vk_pipeline pipeline_diag[2];
     vk_pipeline pipeline_clamp_f32;
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_roll_f32;
@@ -722,6 +723,11 @@ struct vk_device_struct {
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
     vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
     vk_pipeline pipeline_soft_max_back_f32;
+    vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
+    vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
+    vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
+
 vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
     vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
     vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
@@ -757,7 +763,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_flash_attn_split_k_reduce;
-    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
+    // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
+    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
 
     std::vector<vk_pipeline_ref> all_pipelines;
@@ -1149,6 +1156,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
 struct vk_op_topk_moe_push_constants {
     uint32_t n_rows;
+    uint32_t n_experts_push;
     uint32_t n_expert_used;
     float clamp_min;
     float clamp_max;
@@ -3730,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -3917,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
@@ -3996,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
 
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+
     ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@@ -4204,10 +4223,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
 
+    for (uint32_t use_push = 0; use_push < 2; ++use_push) {
     for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
+    }
     }
 
     for (auto &c : compiles) {
@@ -8274,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     switch (op) {
     case GGML_OP_GET_ROWS:
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        if (src0->type == GGML_TYPE_I32) {
+            // i32 src only supports i32 result
+            GGML_ASSERT(dst->type == GGML_TYPE_I32);
+            return ctx->device->pipeline_get_rows[src0->type];
+        }
         if (dst->type == GGML_TYPE_F16) {
             return ctx->device->pipeline_get_rows[src0->type];
         }
@@ -8400,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
         }
         return nullptr;
+    case GGML_OP_DIAG:
+        if (src0->type == dst->type &&
+            (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
+            return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
+        }
+        return nullptr;
     case GGML_OP_CLAMP:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_clamp_f32;
@@ -8554,7 +8586,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
         GGML_ASSERT(idx < num_topk_moe_pipelines);
         topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
-        return ctx->device->pipeline_topk_moe[idx][mode];
+        // use n_experts from push constant if it's not equal to the power of two spec constant
+        bool use_push = dst->ne[0] != (1u << idx);
+        return ctx->device->pipeline_topk_moe[idx][mode][use_push];
     }
 
     if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
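
Note: taken together, these changes drop the power-of-two requirement on the expert count for the fused top-k MoE path. Pipelines are still bucketed by the next power of two, baked in as a spec constant, and when n_experts is not exactly that value the shader reads the real count from the new n_experts_push push constant. A sketch of the selection logic, paraphrasing the code above (the struct and helper names are illustrative):

    #include <cmath>
    #include <cstdint>

    struct topk_moe_choice { uint32_t bucket; bool use_push; };

    static topk_moe_choice choose_topk_moe_pipeline(uint32_t n_experts) {
        const uint32_t bucket = (uint32_t) ceilf(log2f((float) n_experts));
        const bool use_push = n_experts != (1u << bucket); // non-pow2 expert count
        return { bucket, use_push };
    }
    // e.g. n_experts = 128 -> bucket 7, spec-constant path;
    //      n_experts =  96 -> bucket 7, push-constant path (n_experts_push = 96).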
@@ -9091,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_COS:
     case GGML_OP_LOG:
     case GGML_OP_TRI:
+    case GGML_OP_DIAG:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
     case GGML_OP_ROLL:
@@ -9778,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
     ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
 }
 
+static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
+}
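
Note: GGML_OP_DIAG turns each source row into a square diagonal matrix. A scalar sketch of what the new pipeline is expected to compute, assuming ggml_diag's CPU semantics:

    #include <cstdint>

    // src is one row of length ne0; dst is ne0 x ne0, zero off the diagonal.
    static void diag_ref(const float * src, float * dst, int64_t ne0) {
        for (int64_t j = 0; j < ne0; ++j) {
            for (int64_t i = 0; i < ne0; ++i) {
                dst[j*ne0 + i] = (i == j) ? src[i] : 0.0f;
            }
        }
    }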
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
     p.param1 = ggml_get_op_params_f32(dst, 0);
@@ -10111,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
+    vk_op_soft_max_push_constants pc {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
@@ -10122,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
         n_head_log2,
         nrows_x,
         src2 != nullptr
-    });
+    };
+
+    if (ncols <= 16384) {
+        ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
+    } else {
+        vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
+        vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
+        vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
+        vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
+
+        uint32_t elems_per_wg = 128 * 4;
+        uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
+        size_t tmp_size = num_wgs * nrows_x * sizeof(float);
+
+        if (ctx->prealloc_size_x < tmp_size) {
+            ctx->prealloc_size_x = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_size_y < tmp_size) {
+            ctx->prealloc_size_y = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+
+        vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
+        vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
+
+        std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
+
+        vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
+        vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
+        vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
+
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
+
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+
+        ctx->prealloc_x_need_sync = true;
+        ctx->prealloc_y_need_sync = true;
+    }
 }
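
Note: the else-branch above handles rows longer than 16384 elements, which no longer fit a single-workgroup reduction. The row is split across num_wgs workgroups with two scratch buffers for partial results, and the work is done in three dispatches separated by full barriers. The shader internals are not shown in this hunk; presumably the passes follow the usual split softmax (partial maxima, then exponential partial sums against the global maximum, then normalization). A CPU sketch of that scheme, with hypothetical names:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Three-pass softmax over one long row, chunked the way the dispatches
    // above split it across workgroups (an assumption; the real logic lives
    // in the soft_max_large{1,2,3} shaders).
    static void softmax_three_pass(const float * x, float * y, int n, int chunk) {
        const int num_wgs = (n + chunk - 1) / chunk;
        std::vector<float> part_max(num_wgs), part_sum(num_wgs);
        for (int w = 0; w < num_wgs; ++w) {            // pass 1: chunk maxima
            float m = -INFINITY;
            for (int i = w*chunk; i < std::min(n, (w + 1)*chunk); ++i) {
                m = std::max(m, x[i]);
            }
            part_max[w] = m;
        }
        const float gmax = *std::max_element(part_max.begin(), part_max.end());
        for (int w = 0; w < num_wgs; ++w) {            // pass 2: exp partial sums
            float s = 0.0f;
            for (int i = w*chunk; i < std::min(n, (w + 1)*chunk); ++i) {
                s += std::exp(x[i] - gmax);
            }
            part_sum[w] = s;
        }
        float gsum = 0.0f;
        for (float s : part_sum) gsum += s;
        for (int i = 0; i < n; ++i) {                  // pass 3: normalize
            y[i] = std::exp(x[i] - gmax) / gsum;
        }
    }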
 static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -10158,6 +10247,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
     vk_op_topk_moe_push_constants pc {};
     pc.n_rows = n_rows;
+    pc.n_experts_push = n_experts;
     pc.n_expert_used = n_expert_used;
     if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
         ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
@@ -11857,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_TRI:
         ggml_vk_tri(ctx, compute_ctx, src0, node);
+        break;
+    case GGML_OP_DIAG:
+        ggml_vk_diag(ctx, compute_ctx, src0, node);
         break;
     case GGML_OP_CLAMP:
         ggml_vk_clamp(ctx, compute_ctx, src0, node);
@@ -12832,8 +12926,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
     }
 
     const int n_expert = softmax->ne[0];
-    // n_expert must be a power of 2
-    if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
+    if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
         return false;
     }
@@ -13877,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
+        case GGML_TYPE_I32:
             return true;
         default:
             return false;
@@ -14001,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_LOG:
         case GGML_OP_TRI:
+        case GGML_OP_DIAG:
             return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                    op->type == op->src[0]->type;
         case GGML_OP_ARGSORT:
@@ -14591,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_TRI) {
         tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
+    } else if (tensor->op == GGML_OP_DIAG) {
+        tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_CLAMP) {
         const float * params = (const float *)tensor->op_params;
         tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
Some files were not shown because too many files have changed in this diff.