diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index f9dc7f3163..4b708ae278 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -60,6 +60,7 @@ RUN apt-get update \ git \ python3 \ python3-pip \ + && pip install --upgrade pip setuptools wheel \ && pip install --break-system-packages -r requirements.txt \ && apt autoremove -y \ && apt clean -y \ diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..3250e3279e --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,262 @@ +# Copilot Instructions for llama.cpp + +## Repository Overview + +llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance. + +**Key Facts:** +- **Primary language**: C/C++ with Python utility scripts +- **Size**: ~200k+ lines of code across 1000+ files +- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples +- **Core dependency**: ggml tensor library (vendored in `ggml/` directory) +- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA +- **License**: MIT + +## Build Instructions + +### Prerequisites +- CMake 3.14+ (primary build system) +- C++17 compatible compiler (GCC 13.3+, Clang, MSVC) +- Optional: ccache for faster compilation + +### Basic Build (CPU-only) +**ALWAYS run these commands in sequence:** +```bash +cmake -B build +cmake --build build --config Release -j $(nproc) +``` + +**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache. + +**Important Notes:** +- The Makefile is deprecated - always use CMake +- ccache is automatically detected and used if available +- Built binaries are placed in `build/bin/` +- Parallel builds (`-j`) significantly reduce build time + +### Backend-Specific Builds +For CUDA support: +```bash +cmake -B build -DGGML_CUDA=ON +cmake --build build --config Release -j $(nproc) +``` + +For Metal (macOS): +```bash +cmake -B build -DGGML_METAL=ON +cmake --build build --config Release -j $(nproc) +``` + +**Important Note**: While all backends can be built as long as the correct requirements for that backend are installed, you will not be able to run them without the correct hardware. The only backend that can be run for testing and validation is the CPU backend. 
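+
+For example, after a CPU-only build the result can be validated locally; a minimal sketch using only the binaries documented later in this file:
+
+```bash
+# Smoke-test a CPU-only build from the repository root
+./build/bin/llama-cli --version      # confirms the binary links and runs
+./build/bin/test-backend-ops         # exercises ggml operations on the CPU backend
+```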
+ +### Debug Builds +Single-config generators: +```bash +cmake -B build -DCMAKE_BUILD_TYPE=Debug +cmake --build build +``` + +Multi-config generators: +```bash +cmake -B build -G "Xcode" +cmake --build build --config Debug +``` + +### Common Build Issues +- **Issue**: Network tests fail in isolated environments + **Solution**: Expected behavior - core functionality tests will still pass + +## Testing + +### Running Tests +```bash +ctest --test-dir build --output-on-failure -j $(nproc) +``` + +**Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration +**Expected failures**: 2-3 tests may fail if network access is unavailable (they download models) +**Test time**: ~30 seconds for passing tests + +### Server Unit Tests +Run server-specific unit tests after building the server: +```bash +# Build the server first +cmake --build build --target llama-server + +# Navigate to server tests and run +cd tools/server/tests +source ../../../.venv/bin/activate +./tests.sh +``` +**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`. + +### Test Categories +- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.) +- Grammar tests: GBNF parsing and validation +- Backend tests: Core ggml operations across different backends +- Integration tests: End-to-end workflows + +### Manual Testing Commands +```bash +# Test basic inference +./build/bin/llama-cli --version + +# Test model loading (requires model file) +./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10 +``` + +## Code Quality and Linting + +### C++ Code Formatting +**ALWAYS format C++ code before committing:** +```bash +git clang-format +``` + +Configuration is in `.clang-format` with these key rules: +- 4-space indentation +- 120 column limit +- Braces on same line for functions +- Pointer alignment: `void * ptr` (middle) +- Reference alignment: `int & ref` (middle) + +### Python Code +**ALWAYS activate the Python environment in `.venv` and use tools from that environment:** +```bash +# Activate virtual environment +source .venv/bin/activate +``` + +Configuration files: +- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools) +- `pyrightconfig.json`: pyright type checking configuration + +### Pre-commit Hooks +Run before committing: +```bash +pre-commit run --all-files +``` + +## Continuous Integration + +### GitHub Actions Workflows +Key workflows that run on every PR: +- `.github/workflows/build.yml`: Multi-platform builds +- `.github/workflows/server.yml`: Server functionality tests +- `.github/workflows/python-lint.yml`: Python code quality +- `.github/workflows/python-type-check.yml`: Python type checking + +### Local CI Validation +**Run full CI locally before submitting PRs:** +```bash +mkdir tmp + +# CPU-only build +bash ./ci/run.sh ./tmp/results ./tmp/mnt +``` + +**CI Runtime**: 30-60 minutes depending on backend configuration + +### Triggering CI +Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI infrastructure. 
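+
+For example (the commit subject below is purely illustrative; only the `ggml-ci` marker in the message matters):
+
+```bash
+# Including "ggml-ci" in the commit message triggers the heavy CI pipelines
+git commit -m "ggml : illustrative change (ggml-ci)"
+```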
+ +## Project Layout and Architecture + +### Core Directories +- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`) +- **`include/`**: Public API headers, primarily `include/llama.h` +- **`ggml/`**: Core tensor library (submodule with custom GGML framework) +- **`examples/`**: 30+ example applications and tools +- **`tools/`**: Additional development and utility tools (server benchmarks, tests) +- **`tests/`**: Comprehensive test suite with CTest integration +- **`docs/`**: Detailed documentation (build guides, API docs, etc.) +- **`scripts/`**: Utility scripts for CI, data processing, and automation +- **`common/`**: Shared utility code used across examples + +### Key Files +- **`CMakeLists.txt`**: Primary build configuration +- **`include/llama.h`**: Main C API header (~2000 lines) +- **`src/llama.cpp`**: Core library implementation (~8000 lines) +- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements +- **`.clang-format`**: C++ formatting rules +- **`.pre-commit-config.yaml`**: Git hook configuration + +### Built Executables (in `build/bin/`) +Primary tools: +- **`llama-cli`**: Main inference tool +- **`llama-server`**: OpenAI-compatible HTTP server +- **`llama-quantize`**: Model quantization utility +- **`llama-perplexity`**: Model evaluation tool +- **`llama-bench`**: Performance benchmarking +- **`llama-convert-llama2c-to-ggml`**: Model conversion utilities + +### Configuration Files +- **CMake**: `CMakeLists.txt`, `cmake/` directory +- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8` +- **CI**: `.github/workflows/`, `ci/run.sh` +- **Git**: `.gitignore` (includes build artifacts, models, cache) + +### Dependencies +- **System**: OpenMP, libcurl (for model downloading) +- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI +- **Bundled**: httplib, json (header-only libraries in vendored form) + +## Common Validation Steps + +### After Making Changes +1. **Format code**: `git clang-format` +2. **Build**: `cmake --build build --config Release` +3. **Test**: `ctest --test-dir build --output-on-failure` +4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh` +5. 
**Manual validation**: Test relevant tools in `build/bin/` + +### Performance Validation +```bash +# Benchmark inference performance +./build/bin/llama-bench -m model.gguf + +# Evaluate model perplexity +./build/bin/llama-perplexity -m model.gguf -f dataset.txt +``` + +### Backend Validation +```bash +# Test backend operations +./build/bin/test-backend-ops +``` + +## Environment Setup + +### Required Tools +- CMake 3.14+ (install via system package manager) +- Modern C++ compiler with C++17 support +- Git (for submodule management) +- Python 3.9+ with virtual environment (`.venv` is provided) + +### Optional but Recommended +- ccache: `apt install ccache` or `brew install ccache` +- clang-format 15+: Usually included with LLVM/Clang installation +- pre-commit: `pip install pre-commit` + +### Backend-Specific Requirements +- **CUDA**: NVIDIA CUDA Toolkit 11.2+ +- **Metal**: Xcode command line tools (macOS only) +- **Vulkan**: Vulkan SDK +- **SYCL**: Intel oneAPI toolkit + +## Important Guidelines + +### Code Changes +- **Minimal dependencies**: Avoid adding new external dependencies +- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible +- **Performance focus**: This is a performance-critical inference library +- **API stability**: Changes to `include/llama.h` require careful consideration + +### Git Workflow +- Always create feature branches from `master` +- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`) +- Use descriptive commit messages following project conventions + +### Trust These Instructions +Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments. 
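+
+As a convenience, the "After Making Changes" checklist above can be chained into a single run; a minimal sketch using only the commands documented in this file:
+
+```bash
+#!/usr/bin/env bash
+set -e
+git clang-format                                    # 1. format changed C++ code
+cmake --build build --config Release -j $(nproc)    # 2. rebuild
+ctest --test-dir build --output-on-failure          # 3. run the test suite
+# 4. server tests, only when server code was modified:
+# (cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh)
+```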
+ diff --git a/.github/workflows/build-riscv-native.yml b/.github/workflows/build-riscv-native.yml index a968f6d519..86dc0ff76e 100644 --- a/.github/workflows/build-riscv-native.yml +++ b/.github/workflows/build-riscv-native.yml @@ -1,10 +1,11 @@ name: Build on RISCV Linux Machine by Cloud-V on: + pull_request: workflow_dispatch: workflow_call: jobs: - bianbu-riscv64-native: # Bianbu 2.2 + debian-13-riscv64-native: # Bianbu 2.2 runs-on: self-hosted steps: @@ -20,24 +21,40 @@ jobs: build-essential \ gcc-14-riscv64-linux-gnu \ g++-14-riscv64-linux-gnu \ + ccache \ cmake + - name: Setup ccache + run: | + mkdir -p $HOME/.ccache + ccache -M 5G -d $HOME/.ccache + export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log + export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug" + echo "$GITHUB_WORKSPACE" + echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV + echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV + echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV + echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV + - name: Build run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH + cmake -B build \ + -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ + -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 83aaea9947..43553ac13b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1070,7 +1070,8 @@ jobs: write-host "Downloading AMD HIP SDK Installer" Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru + $proc.WaitForExit(600000) write-host "Completed AMD HIP SDK installation" - name: Verify ROCm diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 2d61ce3035..3645e30378 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -39,6 +39,10 @@ jobs: run: | sudo apt-get update sudo apt-get install build-essential libcurl4-openssl-dev + # Install git-clang-format script for formatting only changed code + wget -O /tmp/git-clang-format 
https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format + sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format + sudo chmod +x /usr/local/bin/git-clang-format - name: Set up Python uses: actions/setup-python@v5 @@ -50,4 +54,4 @@ jobs: python3 -m venv .venv .venv/bin/activate pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt - pip install flake8 pyright + pip install flake8 pyright pre-commit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9ebb0236fa..5367637e42 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -557,7 +557,8 @@ jobs: write-host "Downloading AMD HIP SDK Installer" Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru + $proc.WaitForExit(600000) write-host "Completed AMD HIP SDK installation" - name: Verify ROCm diff --git a/.gitignore b/.gitignore index f48ce4cacd..595831accb 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,4 @@ poetry.toml # Local scripts /run-vim.sh /run-chat.sh +.ccache/ diff --git a/CODEOWNERS b/CODEOWNERS index b698b1f4cb..18564a08b1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -5,7 +5,6 @@ /tools/server/ @ngxson /ggml/src/ggml-cuda/fattn* @JohannesGaessler /ggml/src/ggml-cuda/mmq.* @JohannesGaessler -/ggml/src/ggml-cuda/mmv.* @JohannesGaessler /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler /ggml/src/ggml-opt.cpp @JohannesGaessler /ggml/src/gguf.cpp @JohannesGaessler diff --git a/Makefile b/Makefile index ac442aec09..bcbc770205 100644 --- a/Makefile +++ b/Makefile @@ -1,1608 +1,9 @@ -ifndef LLAMA_MAKEFILE -$(error The Makefile build is deprecated. Use the CMake build instead. 
For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) -endif +define newline -# Define the default target now so that it is always the first target -BUILD_TARGETS = \ - libllava.a \ - llama-batched \ - llama-batched-bench \ - llama-bench \ - llama-cli \ - llama-convert-llama2c-to-ggml \ - llama-embedding \ - llama-eval-callback \ - llama-export-lora \ - llama-gbnf-validator \ - llama-gguf \ - llama-gguf-hash \ - llama-gguf-split \ - llama-gritlm \ - llama-imatrix \ - llama-infill \ - llama-llava-cli \ - llama-minicpmv-cli\ - llama-qwen2vl-cli\ - llama-lookahead \ - llama-lookup \ - llama-lookup-create \ - llama-lookup-merge \ - llama-lookup-stats \ - llama-parallel \ - llama-passkey \ - llama-perplexity \ - llama-q8dot \ - llama-quantize \ - llama-quantize-stats \ - llama-retrieval \ - llama-save-load-state \ - llama-server \ - llama-simple \ - llama-simple-chat \ - llama-run \ - llama-speculative \ - llama-tokenize \ - llama-vdot \ - llama-cvector-generator \ - llama-gen-docs \ - tests/test-c.o -# Binaries only useful for tests -TEST_TARGETS = \ - tests/test-arg-parser \ - tests/test-autorelease \ - tests/test-backend-ops \ - tests/test-chat \ - tests/test-chat-template \ - tests/test-double-float \ - tests/test-grammar-integration \ - tests/test-grammar-parser \ - tests/test-json-schema-to-grammar \ - tests/test-llama-grammar \ - tests/test-log \ - tests/test-model-load-cancel \ - tests/test-quantize-fns \ - tests/test-quantize-perf \ - tests/test-rope \ - tests/test-sampling \ - tests/test-tokenizer-0 \ - tests/test-tokenizer-1-bpe \ - tests/test-tokenizer-1-spm -# tests/test-opt \ +endef -# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm - -# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. -# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. -LEGACY_TARGETS_BUILD = main quantize perplexity embedding server - -# Deprecation aliases -ifdef LLAMA_CUBLAS -$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) 
-endif - -ifdef LLAMA_CUDA -GGML_CUDA := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_KOMPUTE -GGML_KOMPUTE := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_METAL -GGML_METAL := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_RPC -GGML_RPC := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_SYCL -GGML_SYCL := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_SYCL_F16 -GGML_SYCL_F16 := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_OPENBLAS -GGML_OPENBLAS := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_OPENBLAS64 -GGML_OPENBLAS64 := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_BLIS -GGML_BLIS := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_NO_LLAMAFILE -GGML_NO_LLAMAFILE := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_NO_ACCELERATE -GGML_NO_ACCELERATE := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_NO_OPENMP -GGML_NO_OPENMP := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_NO_METAL -GGML_NO_METAL := 1 -DEPRECATE_WARNING := 1 -endif - -ifdef LLAMA_DISABLE_LOGS -REMOVE_WARNING := 1 -endif - -ifdef LLAMA_SERVER_VERBOSE -REMOVE_WARNING := 1 -endif - -ifndef UNAME_S -UNAME_S := $(shell uname -s) -endif - -ifndef UNAME_P -UNAME_P := $(shell uname -p) -endif - -ifndef UNAME_M -UNAME_M := $(shell uname -m) -endif - -# In GNU make default CXX is g++ instead of c++. Let's fix that so that users -# of non-gcc compilers don't have to provide g++ alias or wrapper. -DEFCC := cc -DEFCXX := c++ -ifeq ($(origin CC),default) -CC := $(DEFCC) -endif -ifeq ($(origin CXX),default) -CXX := $(DEFCXX) -endif - -# Mac OS + Arm can report x86_64 -# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 -ifeq ($(UNAME_S),Darwin) - ifndef GGML_NO_METAL - GGML_METAL := 1 - endif - - GGML_NO_OPENMP := 1 - - ifneq ($(UNAME_P),arm) - SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) - ifeq ($(SYSCTL_M),1) - # UNAME_P := arm - # UNAME_M := arm64 - warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) - endif - endif -endif - -ifdef GGML_METAL - GGML_METAL_EMBED_LIBRARY := 1 -endif - -ifdef GGML_RPC - BUILD_TARGETS += rpc-server -endif - -ifdef GGML_VULKAN - BUILD_TARGETS += vulkan-shaders-gen -endif - -default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD) - -test: $(TEST_TARGETS) - @failures=0; \ - for test_target in $(TEST_TARGETS); do \ - if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \ - elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \ - continue; \ - elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ - continue; \ - else \ - echo "Running test $$test_target..."; \ - ./$$test_target; \ - fi; \ - if [ $$? 
-ne 0 ]; then \ - printf 'Test %s FAILED!\n\n' $$test_target; \ - failures=$$(( failures + 1 )); \ - else \ - printf 'Test %s passed.\n\n' $$test_target; \ - fi; \ - done; \ - if [ $$failures -gt 0 ]; then \ - printf '\n%s tests failed.\n' $$failures; \ - exit 1; \ - fi - @echo 'All tests passed.' - -all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD) - -ifdef RISCV_CROSS_COMPILE -CC := riscv64-unknown-linux-gnu-gcc -CXX := riscv64-unknown-linux-gnu-g++ -endif - -# -# Compile flags -# - -# keep standard at C11 and C++17 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU -MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++17 -fPIC -MK_NVCCFLAGS = -std=c++17 - -ifdef LLAMA_NO_CCACHE -GGML_NO_CCACHE := 1 -DEPRECATE_WARNING := 1 -endif - -ifndef GGML_NO_CCACHE -CCACHE := $(shell which ccache) -ifdef CCACHE -export CCACHE_SLOPPINESS = time_macros -$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.) -CC := $(CCACHE) $(CC) -CXX := $(CCACHE) $(CXX) -else -$(info I ccache not found. Consider installing it for faster compilation.) -endif # CCACHE -endif # GGML_NO_CCACHE - -# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) -MK_CPPFLAGS += -D_XOPEN_SOURCE=600 - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher -ifeq ($(UNAME_S),OpenBSD) - MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700 -endif - -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc -ifeq ($(UNAME_S),Linux) - MK_CPPFLAGS += -D_GNU_SOURCE - MK_LDFLAGS += -ldl -endif - -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions -# similarly on DragonFly, enabling BSD extensions is necessary -ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -D_DARWIN_C_SOURCE -endif -ifeq ($(UNAME_S),DragonFly) - MK_CPPFLAGS += -D__BSD_VISIBLE -endif - -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases -ifeq ($(UNAME_S),FreeBSD) - MK_CPPFLAGS += -D__BSD_VISIBLE -endif -ifeq ($(UNAME_S),NetBSD) - MK_CPPFLAGS += -D_NETBSD_SOURCE -endif -ifeq ($(UNAME_S),OpenBSD) - MK_CPPFLAGS += -D_BSD_SOURCE -endif - -ifdef GGML_SCHED_MAX_COPIES - MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) -endif - -ifdef LLAMA_DEBUG - MK_CFLAGS += -O0 -g - MK_CXXFLAGS += -O0 -g - MK_LDFLAGS += -g - MK_NVCCFLAGS += -O0 -g - - ifeq ($(UNAME_S),Linux) - MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS - endif -else - MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 -g - MK_CXXFLAGS += -O3 -g - MK_NVCCFLAGS += -O3 -g -endif - -ifdef LLAMA_SANITIZE_THREAD - MK_CFLAGS += -fsanitize=thread -g - MK_CXXFLAGS += -fsanitize=thread -g - MK_LDFLAGS += -fsanitize=thread -g -endif - -ifdef LLAMA_SANITIZE_ADDRESS - MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g - MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g - MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g -endif - -ifdef LLAMA_SANITIZE_UNDEFINED - MK_CFLAGS += -fsanitize=undefined -g - MK_CXXFLAGS += -fsanitize=undefined -g - MK_LDFLAGS += 
-fsanitize=undefined -g -endif - -ifdef LLAMA_SERVER_SSL - MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT - MK_LDFLAGS += -lssl -lcrypto -endif - -ifndef GGML_NO_CPU_AARCH64 - MK_CPPFLAGS += -DGGML_USE_CPU_REPACK -endif - -# warnings -WARN_FLAGS = \ - -Wall \ - -Wextra \ - -Wpedantic \ - -Wcast-qual \ - -Wno-unused-function - -MK_CFLAGS += \ - $(WARN_FLAGS) \ - -Wshadow \ - -Wstrict-prototypes \ - -Wpointer-arith \ - -Wmissing-prototypes \ - -Werror=implicit-int \ - -Werror=implicit-function-declaration - -MK_CXXFLAGS += \ - $(WARN_FLAGS) \ - -Wmissing-declarations \ - -Wmissing-noreturn - -ifeq ($(LLAMA_FATAL_WARNINGS),1) - MK_CFLAGS += -Werror - MK_CXXFLAGS += -Werror -endif - -# this version of Apple ld64 is buggy -ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))' - MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER -endif - -# OS specific -# TODO: support Windows -ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)' - MK_CFLAGS += -pthread - MK_CXXFLAGS += -pthread -endif - -# detect Windows -ifneq ($(findstring _NT,$(UNAME_S)),) - _WIN32 := 1 -endif - -# library name prefix -ifneq ($(_WIN32),1) - LIB_PRE := lib -endif - -# Dynamic Shared Object extension -ifneq ($(_WIN32),1) - DSO_EXT := .so -else - DSO_EXT := .dll -endif - -# Windows Sockets 2 (Winsock) for network-capable apps -ifeq ($(_WIN32),1) - LWINSOCK2 := -lws2_32 -endif - -ifdef LLAMA_GPROF - MK_CFLAGS += -pg - MK_CXXFLAGS += -pg -endif - -# Architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue - -ifndef RISCV_CROSS_COMPILE - -ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) - # Use all CPU extensions that are available: - MK_CFLAGS += -march=native -mtune=native - HOST_CXXFLAGS += -march=native -mtune=native - - # Usage AMX build test - #MK_CFLAGS += -march=graniterapids -mtune=graniterapids - #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids - - # Usage AVX-only - #MK_CFLAGS += -mfma -mf16c -mavx - #MK_CXXFLAGS += -mfma -mf16c -mavx - - # Usage SSSE3-only (Not is SSE3!) - #MK_CFLAGS += -mssse3 - #MK_CXXFLAGS += -mssse3 -endif - -ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' - # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 - # https://github.com/ggml-org/llama.cpp/issues/2922 - MK_CFLAGS += -Xassembler -muse-unaligned-vector-move - MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move - - # Target Windows 8 for PrefetchVirtualMemory - MK_CPPFLAGS += -D_WIN32_WINNT=0x602 -endif - -ifneq ($(filter aarch64%,$(UNAME_M)),) - # Apple M1, M2, etc. 
- # Raspberry Pi 3, 4, Zero 2 (64-bit) - # Nvidia Jetson - MK_CFLAGS += -mcpu=native - MK_CXXFLAGS += -mcpu=native - JETSON_RELEASE_INFO = $(shell jetson_release) - ifdef JETSON_RELEASE_INFO - ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),) - JETSON_EOL_MODULE_DETECT = 1 - CC = aarch64-unknown-linux-gnu-gcc - cxx = aarch64-unknown-linux-gnu-g++ - endif - endif -endif - -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, Zero - MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access - MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 2 - MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations - MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif - -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 3, 4, Zero 2 (32-bit) - MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access - MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter ppc64%,$(UNAME_M)),) - POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) - ifneq (,$(findstring POWER9,$(POWER9_M))) - MK_CFLAGS += -mcpu=power9 - MK_CXXFLAGS += -mcpu=power9 - endif -endif - -ifneq ($(filter ppc64le%,$(UNAME_M)),) - MK_CFLAGS += -mcpu=powerpc64le - MK_CXXFLAGS += -mcpu=powerpc64le - CUDA_POWER_ARCH = 1 -endif - -ifneq ($(filter loongarch64%,$(UNAME_M)),) - MK_CFLAGS += -mlasx - MK_CXXFLAGS += -mlasx -endif - -ifneq ($(filter riscv64%,$(UNAME_M)),) - MK_CFLAGS += -march=rv64gcv -mabi=lp64d - MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d -endif - -else # RISC-V CROSS COMPILATION - MK_CFLAGS += -march=rv64gcv -mabi=lp64d - MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d -endif - -ifndef GGML_NO_ACCELERATE - # Mac OS - include Accelerate framework. 
- # `-framework Accelerate` works both with Apple Silicon and Mac Intel - ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE - MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK - MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 - MK_LDFLAGS += -framework Accelerate - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o - endif -endif # GGML_NO_ACCELERATE - -ifndef GGML_NO_OPENMP - MK_CPPFLAGS += -DGGML_USE_OPENMP - MK_CFLAGS += -fopenmp - MK_CXXFLAGS += -fopenmp -endif # GGML_NO_OPENMP - -ifdef GGML_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) - MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o -endif # GGML_OPENBLAS - -ifdef GGML_OPENBLAS64 - MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) - MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) - MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o -endif # GGML_OPENBLAS64 - -ifdef GGML_BLIS - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis - MK_LDFLAGS += -lblis -L/usr/local/lib - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o -endif # GGML_BLIS - -ifdef GGML_NVPL - MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas - MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp - OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o -endif # GGML_NVPL - -ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o -endif - -ifndef GGML_NO_AMX - MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o -endif - -# only necessary for the CPU backend files -MK_CPPFLAGS += -Iggml/src/ggml-cpu - -ifdef GGML_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJ_GGML_EXT += ggml/src/ggml-rpc.o -endif # GGML_RPC - -OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu)) -OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) - -ifdef GGML_CUDA_FA_ALL_QUANTS - OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu)) -else - OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) - OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) - OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) -endif # GGML_CUDA_FA_ALL_QUANTS - -ifdef GGML_CUDA - ifneq ('', '$(wildcard /opt/cuda)') - CUDA_PATH ?= /opt/cuda - else - CUDA_PATH ?= /usr/local/cuda - endif - - MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - MK_NVCCFLAGS += -use_fast_math - - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) - -ifdef LLAMA_FATAL_WARNINGS - MK_NVCCFLAGS += -Werror all-warnings -endif # LLAMA_FATAL_WARNINGS - -ifndef 
JETSON_EOL_MODULE_DETECT - MK_NVCCFLAGS += --forward-unknown-to-host-compiler -endif # JETSON_EOL_MODULE_DETECT - -ifdef LLAMA_DEBUG - MK_NVCCFLAGS += -lineinfo -endif # LLAMA_DEBUG - -ifdef GGML_CUDA_DEBUG - MK_NVCCFLAGS += --device-debug -endif # GGML_CUDA_DEBUG - -ifdef GGML_CUDA_NVCC - NVCC = $(CCACHE) $(GGML_CUDA_NVCC) -else - NVCC = $(CCACHE) nvcc -endif # GGML_CUDA_NVCC - -ifdef CUDA_DOCKER_ARCH - MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) -else ifndef CUDA_POWER_ARCH - MK_NVCCFLAGS += -arch=native -endif # CUDA_DOCKER_ARCH - -ifdef GGML_CUDA_FORCE_MMQ - MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # GGML_CUDA_FORCE_MMQ - -ifdef GGML_CUDA_FORCE_CUBLAS - MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS -endif # GGML_CUDA_FORCE_CUBLAS - -ifdef GGML_CUDA_F16 - MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_F16 - -ifdef GGML_CUDA_DMMV_F16 - MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_DMMV_F16 - -ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE - MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) -else - MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -endif # GGML_CUDA_PEER_MAX_BATCH_SIZE - -ifdef GGML_CUDA_NO_PEER_COPY - MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # GGML_CUDA_NO_PEER_COPY - -ifdef GGML_CUDA_CCBIN - MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN) -endif # GGML_CUDA_CCBIN - -ifdef GGML_CUDA_NO_FA - MK_NVCCFLAGS += -DGGML_CUDA_NO_FA -endif # GGML_CUDA_NO_FA - -ifdef GGML_CUDA_FA_ALL_QUANTS - MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS -endif # GGML_CUDA_FA_ALL_QUANTS - -ifdef JETSON_EOL_MODULE_DETECT -define NVCC_COMPILE - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ -endef # NVCC_COMPILE -else -define NVCC_COMPILE - $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ -endef # NVCC_COMPILE -endif # JETSON_EOL_MODULE_DETECT - -ggml/src/ggml-cuda/%.o: \ - ggml/src/ggml-cuda/%.cu \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h \ - ggml/src/ggml-cuda/common.cuh - $(NVCC_COMPILE) - -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h \ - ggml/src/ggml-backend-impl.h \ - ggml/src/ggml-common.h \ - $(wildcard ggml/src/ggml-cuda/*.cuh) - $(NVCC_COMPILE) -endif # GGML_CUDA - -ifdef GGML_VULKAN - MK_CPPFLAGS += -DGGML_USE_VULKAN - MK_LDFLAGS += $(shell pkg-config --libs vulkan) - OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o - -ifdef GGML_VULKAN_CHECK_RESULTS - MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS -endif - -ifdef GGML_VULKAN_DEBUG - MK_CPPFLAGS += -DGGML_VULKAN_DEBUG -endif - -ifdef GGML_VULKAN_MEMORY_DEBUG - MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG -endif - -ifdef GGML_VULKAN_PERF - MK_CPPFLAGS += -DGGML_VULKAN_PERF -endif - -ifdef GGML_VULKAN_VALIDATE - MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE -endif - -ifdef GGML_VULKAN_RUN_TESTS - MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS -endif - -GLSLC_CMD = glslc -_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen -_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp -_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp -_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders -_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) - -ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h 
$(_ggml_vk_header) $(_ggml_vk_source) - $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ - -$(_ggml_vk_header): $(_ggml_vk_source) - -$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen - $(_ggml_vk_genshaders_cmd) \ - --glslc $(GLSLC_CMD) \ - --input-dir $(_ggml_vk_input_dir) \ - --target-hpp $(_ggml_vk_header) \ - --target-cpp $(_ggml_vk_source) - -vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp - $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp - -endif # GGML_VULKAN - -ifdef GGML_HIP - ifeq ($(wildcard /opt/rocm),) - ROCM_PATH ?= /usr - AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) - else - ROCM_PATH ?= /opt/rocm - AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) - endif - - MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA - - MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 - MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas - - HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc - - HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) - -ifdef GGML_CUDA_FORCE_MMQ - HIPFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # GGML_CUDA_FORCE_MMQ - -ifdef GGML_CUDA_FORCE_CUBLAS - HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS -endif # GGML_CUDA_FORCE_CUBLAS - -ifdef GGML_CUDA_NO_PEER_COPY - HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # GGML_CUDA_NO_PEER_COPY - -ifdef GGML_CUDA_NO_FA - HIPFLAGS += -DGGML_CUDA_NO_FA -endif # GGML_CUDA_NO_FA - - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) - -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h \ - ggml/src/ggml-backend-impl.h \ - ggml/src/ggml-common.h \ - $(wildcard ggml/src/ggml-cuda/*.cuh) - $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< - -ggml/src/ggml-cuda/%.o: \ - ggml/src/ggml-cuda/%.cu \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h \ - ggml/src/ggml-cuda/common.cuh - $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -endif # GGML_HIP - -ifdef GGML_MUSA - ifeq ($(wildcard /opt/musa),) - MUSA_PATH ?= /usr/local/musa - else - MUSA_PATH ?= /opt/musa - endif - MUSA_ARCHITECTURES ?= 21;22;31 - - MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA - MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib - MK_LDFLAGS += -lmusa -lmusart -lmublas - - ifndef GGML_NO_OPENMP - # For Ubuntu Focal - MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp - MK_LDFLAGS += -L/usr/lib/llvm-10/lib - # For Ubuntu Jammy - MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include - MK_LDFLAGS += -L/usr/lib/llvm-14/lib - endif # GGML_NO_OPENMP - - CC := $(MUSA_PATH)/bin/clang - CXX := $(MUSA_PATH)/bin/clang++ - MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc - - MUSAFLAGS = -fsigned-char -x musa -mtgpu - MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch)) - -ifdef GGML_CUDA_FORCE_MMQ - MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # GGML_CUDA_FORCE_MMQ - -ifdef GGML_CUDA_FORCE_CUBLAS - MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS -endif # GGML_CUDA_FORCE_CUBLAS - -ifdef GGML_CUDA_F16 - MUSAFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_F16 - -ifdef GGML_CUDA_DMMV_F16 - MUSAFLAGS += -DGGML_CUDA_F16 -endif # GGML_CUDA_DMMV_F16 - -ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE - MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) -else - MUSAFLAGS += 
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -endif # GGML_CUDA_PEER_MAX_BATCH_SIZE - -ifdef GGML_CUDA_NO_PEER_COPY - MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # GGML_CUDA_NO_PEER_COPY - -ifdef GGML_CUDA_NO_FA - MUSAFLAGS += -DGGML_CUDA_NO_FA -endif # GGML_CUDA_NO_FA - -ifdef GGML_CUDA_FA_ALL_QUANTS - MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS -endif # GGML_CUDA_FA_ALL_QUANTS - - OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o - OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) - OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) - -ggml/src/ggml-cuda/ggml-cuda.o: \ - ggml/src/ggml-cuda/ggml-cuda.cu \ - ggml/include/ggml-cuda.h \ - ggml/include/ggml.h \ - ggml/include/ggml-backend.h \ - ggml/src/ggml-backend-impl.h \ - ggml/src/ggml-common.h \ - $(wildcard ggml/src/ggml-cuda/*.cuh) - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $< - -ggml/src/ggml-cuda/%.o: \ - ggml/src/ggml-cuda/%.cu \ - ggml/include/ggml.h \ - ggml/src/ggml-common.h \ - ggml/src/ggml-cuda/common.cuh - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $< -endif # GGML_MUSA - -ifdef GGML_METAL - MK_CPPFLAGS += -DGGML_USE_METAL - MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o - -ifdef GGML_METAL_USE_BF16 - MK_CPPFLAGS += -DGGML_METAL_USE_BF16 -endif # GGML_METAL_USE_BF16 -ifdef GGML_METAL_NDEBUG - MK_CPPFLAGS += -DGGML_METAL_NDEBUG -endif -ifdef GGML_METAL_EMBED_LIBRARY - MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o -endif -endif # GGML_METAL - -ifdef GGML_METAL -ggml/src/ggml-metal/ggml-metal.o: \ - ggml/src/ggml-metal/ggml-metal.m \ - ggml/src/ggml-metal/ggml-metal-impl.h \ - ggml/include/ggml-metal.h \ - ggml/include/ggml.h - $(CC) $(CFLAGS) -c $< -o $@ - -ifdef GGML_METAL_EMBED_LIBRARY -ggml/src/ggml-metal-embed.o: \ - ggml/src/ggml-metal/ggml-metal.metal \ - ggml/src/ggml-metal/ggml-metal-impl.h \ - ggml/src/ggml-common.h - @echo "Embedding Metal library" - @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp - @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ - @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s - @rmdir ${TEMP_ASSEMBLY} -endif -endif # GGML_METAL - -DIR_GGML = ggml -DIR_LLAMA = src -DIR_COMMON = common - -OBJ_GGML = \ - $(DIR_GGML)/src/ggml.o \ - $(DIR_GGML)/src/ggml-alloc.o \ - $(DIR_GGML)/src/ggml-backend.o \ - $(DIR_GGML)/src/ggml-backend-reg.o \ - $(DIR_GGML)/src/ggml-opt.o \ - $(DIR_GGML)/src/ggml-quants.o \ - $(DIR_GGML)/src/ggml-threading.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ - $(DIR_GGML)/src/ggml-cpu/repack.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o 
\ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ - $(OBJ_GGML_EXT) - -OBJ_LLAMA = \ - $(DIR_LLAMA)/llama.o \ - $(DIR_LLAMA)/llama-vocab.o \ - $(DIR_LLAMA)/llama-grammar.o \ - $(DIR_LLAMA)/llama-sampling.o \ - $(DIR_LLAMA)/unicode.o \ - $(DIR_LLAMA)/unicode-data.o - -OBJ_COMMON = \ - $(DIR_COMMON)/common.o \ - $(DIR_COMMON)/arg.o \ - $(DIR_COMMON)/log.o \ - $(DIR_COMMON)/console.o \ - $(DIR_COMMON)/ngram-cache.o \ - $(DIR_COMMON)/sampling.o \ - $(DIR_COMMON)/speculative.o \ - $(DIR_COMMON)/chat.o \ - $(DIR_COMMON)/build-info.o \ - $(DIR_COMMON)/json-schema-to-grammar.o - -OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) - -LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) -LIB_GGML_S = $(LIB_PRE)ggml.a - -LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) -LIB_LLAMA_S = $(LIB_PRE)llama.a - -LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) -LIB_COMMON_S = $(LIB_PRE)common.a - -LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) -LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) - -GF_CC := $(CC) -include scripts/get-flags.mk - -# combine build flags with cmdline overrides -override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) -override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) -BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS) -override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS) -override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) -override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) - -# identify CUDA host compiler -ifdef GGML_CUDA -GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler -include scripts/get-flags.mk -CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic -endif - -ifdef LLAMA_CURL -override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL -override LDFLAGS := $(LDFLAGS) -lcurl -endif - -# -# Print build information -# - -$(info I llama.cpp build info: ) -$(info I UNAME_S: $(UNAME_S)) -$(info I UNAME_P: $(UNAME_P)) -$(info I UNAME_M: $(UNAME_M)) -$(info I CFLAGS: $(CFLAGS)) -$(info I CXXFLAGS: $(CXXFLAGS)) -$(info I NVCCFLAGS: $(NVCCFLAGS)) -$(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(shell $(CC) --version | head -n 1)) -$(info I CXX: $(shell $(CXX) --version | head -n 1)) -ifdef GGML_CUDA -$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) -CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') -ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) - -ifndef CUDA_DOCKER_ARCH -ifndef CUDA_POWER_ARCH -$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus ) -endif # CUDA_POWER_ARCH -endif # CUDA_DOCKER_ARCH - -endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) -endif # GGML_CUDA -$(info ) - -ifdef DEPRECATE_WARNING -$(info !!! DEPRECATION WARNING !!!) -$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) -$(info - LLAMA_CUDA) -$(info - LLAMA_METAL) -$(info - LLAMA_METAL_EMBED_LIBRARY) -$(info - LLAMA_OPENMP) -$(info - LLAMA_RPC) -$(info - LLAMA_SYCL) -$(info - LLAMA_SYCL_F16) -$(info - LLAMA_OPENBLAS) -$(info - LLAMA_OPENBLAS64) -$(info - LLAMA_BLIS) -$(info - LLAMA_NO_LLAMAFILE) -$(info - LLAMA_NO_ACCELERATE) -$(info - LLAMA_NO_OPENMP) -$(info - LLAMA_NO_METAL) -$(info - LLAMA_NO_CCACHE) -$(info ) -endif - -ifdef REMOVE_WARNING -$(info !!! 
REMOVAL WARNING !!!) -$(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418)) -$(info ) -endif - -# -# Build libraries -# - -# Libraries -LIB_GGML = libggml.so -LIB_GGML_S = libggml.a - -LIB_LLAMA = libllama.so -LIB_LLAMA_S = libllama.a - -LIB_COMMON = libcommon.so -LIB_COMMON_S = libcommon.a - -# Targets -BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S) - -# Dependency files -DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) - -# Default target -all: $(BUILD_TARGETS) - -# force c++ build for source file that have same name as c file -# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files -$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -# Rules for building object files -$(DIR_GGML)/%.o: $(DIR_GGML)/%.c - $(CC) $(CFLAGS) -MMD -c $< -o $@ - -$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp - $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ - -# Rules for building libraries -$(LIB_GGML): $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) - -$(LIB_GGML_S): $(OBJ_GGML) - ar rcs $(LIB_GGML_S) $^ - -$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) - -$(LIB_LLAMA_S): $(OBJ_LLAMA) - ar rcs $(LIB_LLAMA_S) $^ - -$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) - -$(LIB_COMMON_S): $(OBJ_COMMON) - ar rcs $(LIB_COMMON_S) $^ - -# Include dependency files --include $(DEP_FILES) - -# Clean generated server assets -clean-server-assets: - find tools/server -type f -name "*.js.hpp" -delete - find tools/server -type f -name "*.mjs.hpp" -delete - find tools/server -type f -name "*.css.hpp" -delete - find tools/server -type f -name "*.html.hpp" -delete - -# Clean rule -clean: clean-server-assets - rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS) - rm -rvf *.a *.dll *.so *.dot - find ggml src common tests examples pocs -type f -name "*.o" -delete - find ggml src common tests examples pocs -type f -name "*.d" -delete - -# -# Examples -# - -# $< is the first prerequisite, i.e. the source file. -# Explicitly compile this to an object file so that it can be cached with ccache. -# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead. - -# Helper function that replaces .c, .cpp, and .cu file endings with .o: -GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) - -llama-cli: tools/main/main.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo - @echo '==== Run ./llama-cli -h for help. 
====' - @echo - -llama-run: tools/run/run.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-simple: examples/simple/simple.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-simple-chat: examples/simple-chat/simple-chat.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-tokenize: tools/tokenize/tokenize.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-batched: examples/batched/batched.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-batched-bench: tools/batched-bench/batched-bench.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-quantize: tools/quantize/quantize.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-perplexity: tools/perplexity/perplexity.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-imatrix: tools/imatrix/imatrix.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-embedding: examples/embedding/embedding.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gritlm: examples/gritlm/gritlm.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-save-load-state: examples/save-load-state/save-load-state.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gguf: examples/gguf/gguf.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -examples/gguf-hash/deps/sha1/sha1.o: \ - examples/gguf-hash/deps/sha1/sha1.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -examples/gguf-hash/deps/xxhash/xxhash.o: \ - examples/gguf-hash/deps/xxhash/xxhash.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -examples/gguf-hash/deps/sha256/sha256.o: \ - examples/gguf-hash/deps/sha256/sha256.c - $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ - -llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) 
-Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gguf-split: tools/gguf-split/gguf-split.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-eval-callback: examples/eval-callback/eval-callback.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-bench: tools/llama-bench/llama-bench.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-export-lora: tools/export-lora/export-lora.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-retrieval: examples/retrieval/retrieval.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-speculative: examples/speculative/speculative.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-parallel: examples/parallel/parallel.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookahead: examples/lookahead/lookahead.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup: examples/lookup/lookup.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-create: examples/lookup/lookup-create.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-merge: examples/lookup/lookup-merge.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-lookup-stats: examples/lookup/lookup-stats.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-passkey: examples/passkey/passkey.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) 
$(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -ifdef GGML_RPC -rpc-server: tools/rpc/rpc-server.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif # GGML_RPC - -llama-server: \ - tools/server/server.cpp \ - tools/server/utils.hpp \ - tools/server/httplib.h \ - tools/server/index.html.hpp \ - tools/server/loading.html.hpp \ - common/chat.cpp \ - common/chat.h \ - common/chat-template.hpp \ - common/json.hpp \ - common/minja.hpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) - -# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: -tools/server/%.hpp: tools/server/public/% FORCE Makefile - @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ - echo "unsigned char $${NAME}[] = {" && \ - cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ - echo "};" && \ - echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ - ) > $@ - -llama-gen-docs: examples/gen-docs/gen-docs.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -libllava.a: tools/mtmd/llava.cpp \ - tools/mtmd/llava.h \ - tools/mtmd/clip.cpp \ - tools/mtmd/clip.h \ - common/stb_image.h \ - common/base64.hpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual - -llama-llava-cli: tools/mtmd/llava-cli.cpp \ - tools/mtmd/llava.cpp \ - tools/mtmd/llava.h \ - tools/mtmd/clip.cpp \ - tools/mtmd/clip.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual - -llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \ - tools/mtmd/llava.cpp \ - tools/mtmd/llava.h \ - tools/mtmd/clip.cpp \ - tools/mtmd/clip.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual - -llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \ - tools/mtmd/llava.cpp \ - tools/mtmd/llava.h \ - tools/mtmd/clip.cpp \ - tools/mtmd/clip.h \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual - -ifeq ($(UNAME_S),Darwin) -swift: examples/batched.swift - (cd examples/batched.swift; make build) -endif - -common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh - @sh scripts/build-info.sh "$(CC)" > $@.tmp - @if ! 
cmp -s $@.tmp $@; then \ - mv $@.tmp $@; \ - else \ - rm $@.tmp; \ - fi - -common/build-info.o: common/build-info.cpp - $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ - -# -# Tests -# - -tests: $(TEST_TARGETS) - -tests/test-arg-parser: tests/test-arg-parser.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-llama-grammar: tests/test-llama-grammar.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-log: tests/test-log.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-grammar-parser: tests/test-grammar-parser.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-grammar-integration: tests/test-grammar-integration.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-double-float: tests/test-double-float.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-chat: tests/test-chat.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-opt: tests/test-opt.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-quantize-fns: tests/test-quantize-fns.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-quantize-perf: tests/test-quantize-perf.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-sampling: tests/test-sampling.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) 
$(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-c.o: tests/test-c.c include/llama.h - $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ - -tests/test-backend-ops: tests/test-backend-ops.cpp \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-chat-template: tests/test-chat-template.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -# -# PoCs -# - -llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ - $(OBJ_GGML) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -# -# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. -# -# Mark legacy binary targets as .PHONY so that they are always checked. -.PHONY: FORCE main quantize perplexity embedding server - -# Define the object file target -examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $@ - -# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. -# Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.o - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." - -server: examples/deprecation-warning/deprecation-warning.o - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." - -quantize: examples/deprecation-warning/deprecation-warning.o -ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." - @echo " Remove the 'quantize' binary to remove this warning." - @echo "#########" -endif - -perplexity: examples/deprecation-warning/deprecation-warning.o -ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." - @echo " Remove the 'perplexity' binary to remove this warning." - @echo "#########" -endif - -embedding: examples/deprecation-warning/deprecation-warning.o -ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." - @echo " Remove the 'embedding' binary to remove this warning." 
- @echo "#########" -endif +$(error Build system changed:$(newline)\ +The Makefile build has been replaced by CMake.$(newline)$(newline)\ +For build instructions see:$(newline)\ +https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md$(newline)${newline}) diff --git a/README.md b/README.md index 11d9290786..8446756384 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ LLM inference in C/C++ ## Hot topics +- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)** - **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)** - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095) - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen) diff --git a/ci/run.sh b/ci/run.sh index 4d3abf9232..d51ba44385 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -106,7 +106,7 @@ function gg_wget { cd $out # should not re-download if file is the same - wget -nv -N $url + wget -nv -c -N $url cd $cwd } diff --git a/common/arg.cpp b/common/arg.cpp index b7e9a639bb..91c2229ebd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + add_opt(common_arg( + {"--context-shift"}, + string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? 
"enabled" : "disabled"), + [](common_params & params) { + params.ctx_shift = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1823,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.sampling.top_n_sigma = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam()); + ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), diff --git a/common/chat.cpp b/common/chat.cpp index 23d3828f9c..7f6809a4ed 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -147,6 +147,7 @@ struct templates_params { json extra_context; bool add_bos; bool add_eos; + bool is_inference = true; }; common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) { @@ -632,7 +633,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) { case COMMON_REASONING_FORMAT_AUTO: return "auto"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy"; - case COMMON_REASONING_FORMAT_GRANITE: return "granite"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -1337,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp common_chat_params data; auto prompt = apply(tmpl, inputs); + // Check if we need to replace the return token with end token during + // inference and without generation prompt. For more details see: + // https://github.com/ggml-org/llama.cpp/issues/15417 + if (inputs.is_inference && !inputs.add_generation_prompt) { + static constexpr std::string_view return_token = "<|return|>"; + static constexpr std::string_view end_token = "<|end|>"; + if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) { + prompt.replace(pos, return_token.length(), end_token); + } + } + data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_GPT_OSS; diff --git a/common/common.cpp b/common/common.cpp index 67dd5404ff..decabcc2ed 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector 0.0 }; +// reasoning API response format (not to be confused as chat template's reasoning format) enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, - COMMON_REASONING_FORMAT_AUTO, + COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content` COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in tags in stream mode COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. - COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. 
+ // do not extend this enum unless you absolutely have to + // in most cases, use COMMON_REASONING_FORMAT_AUTO + // see: https://github.com/ggml-org/llama.cpp/pull/15408 }; @@ -372,7 +375,7 @@ struct common_params { bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention bool no_perf = false; // disable performance metrics - bool ctx_shift = true; // context shift on inifinite text generation + bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 444e2cbdfb..42bf10d216 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -89,13 +89,16 @@ class ModelBase: block_count: int tensor_map: gguf.TensorNameMap + # Mistral format specifics is_mistral_format: bool = False + disable_mistral_community_chat_template: bool = False def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, + disable_mistral_community_chat_template: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -147,6 +150,9 @@ class ModelBase: self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + # Mistral specific + self.disable_mistral_community_chat_template = disable_mistral_community_chat_template + @classmethod def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: stem, suffix = path.stem, path.suffix @@ -1334,6 +1340,12 @@ class MmprojModel(ModelBase): return None raise KeyError(f"could not find any of: {keys}") + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, name, n_dims # unused + if ".patch_embd.weight" in new_name: + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return False + @ModelBase.register("GPTNeoXForCausalLM") class GPTNeoXModel(TextModel): @@ -2005,8 +2017,17 @@ class LlamaModel(TextModel): template_dir = Path(__file__).parent / "models/templates/" - template = MistralModel.get_community_chat_template(vocab, template_dir) - self.gguf_writer.add_chat_template(template) + if not self.is_mistral_format or not self.disable_mistral_community_chat_template: + # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. + if self.is_mistral_format: + logger.info( + "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " + "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." 
+ ) + template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) + self.gguf_writer.add_chat_template(template) + else: + logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.") def set_vocab(self): if self.is_mistral_format: @@ -2305,10 +2326,9 @@ class SmolVLMModel(MmprojModel): self.gguf_writer.add_vision_use_gelu(True) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".embeddings." in name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3296,12 +3316,9 @@ class Qwen2VLVisionModel(MmprojModel): self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd." in new_name: - return gguf.GGMLQuantizationType.F16 if ".position_embd." in new_name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3374,10 +3391,9 @@ class Qwen25OmniModel(Qwen2VLVisionModel): yield ("audio_tower.embed_positions.weight", pos_embd) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".conv" in name and ".weight" in name: return gguf.GGMLQuantizationType.F16 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("thinker."): @@ -3423,12 +3439,9 @@ class InternVisionModel(MmprojModel): self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd." in new_name: - return gguf.GGMLQuantizationType.F16 if ".position_embd." in new_name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def _mapping_interns1_name(self, name): names_map = { @@ -5062,13 +5075,12 @@ class Gemma3VisionModel(MmprojModel): self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused # related to https://github.com/ggml-org/llama.cpp/issues/13025 if "input_projection" in name: return gguf.GGMLQuantizationType.F16 if ".embeddings." 
in name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -7727,10 +7739,9 @@ class WhisperEncoderModel(MmprojModel): self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".conv" in name and ".weight" in name: return gguf.GGMLQuantizationType.F16 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -8251,8 +8262,7 @@ class GptOssModel(TextModel): self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096)) -@ModelBase.register("Lfm2ForCausalLM") -@ModelBase.register("LFM2ForCausalLM") +@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") class LFM2Model(TextModel): model_arch = gguf.MODEL_ARCH.LFM2 @@ -8287,6 +8297,13 @@ class LFM2Model(TextModel): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + if is_vision_tensor: + # skip vision tensors + return [] + + name = name.replace("language_model.", "") + # conv op requires 2d tensor if 'conv.conv' in name: data_torch = data_torch.squeeze(1) @@ -8294,6 +8311,41 @@ class LFM2Model(TextModel): return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("Lfm2VlForConditionalGeneration") +class LFM2VLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility + self.hparams_vision["image_size"] = 256 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 + vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if is_vision_tensor: + # remove "model." 
prefix + name = name.replace("model.vision_tower.", "vision_tower.") + name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") + + if "patch_embedding.weight" in name: + data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): model_arch = gguf.MODEL_ARCH.SMALLTHINKER @@ -8385,7 +8437,7 @@ class MistralModel(LlamaModel): undo_permute = False @staticmethod - def get_community_chat_template(vocab: MistralVocab, templates_dir: Path): + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): assert TokenizerVersion is not None, "mistral_common is not installed" assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" @@ -8406,7 +8458,13 @@ class MistralModel(LlamaModel): elif vocab.tokenizer.version == TokenizerVersion.v13: template_file = "unsloth-mistral-Devstral-Small-2507.jinja" else: - raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}") + err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" + if is_mistral_format: + err_message += ( + " . Please pass --disable-mistral-community-chat-template argument to the CLI " + "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." + ) + raise ValueError(err_message) template_path = templates_dir / template_file if not template_path.exists(): @@ -8601,6 +8659,13 @@ def parse_args() -> argparse.Namespace: "--mistral-format", action="store_true", help="Whether the model is stored following the Mistral format.", ) + parser.add_argument( + "--disable-mistral-community-chat-template", action="store_true", + help=( + "Whether to disable usage of Mistral community chat templates. If set, use the Mistral official `mistral-common` library for tokenization and detokenization of Mistral models. " + "Using `mistral-common` ensure correctness and zero-day support of tokenization for models converted from the Mistral format but requires to manually setup the tokenization server." 
+ ) + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -8707,6 +8772,7 @@ def main() -> None: fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") is_mistral_format = args.mistral_format + disable_mistral_community_chat_template = args.disable_mistral_community_chat_template with torch.inference_mode(): output_type = ftype_map[args.outtype] @@ -8733,7 +8799,7 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id, + remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template ) if args.vocab_only: diff --git a/docs/build.md b/docs/build.md index dd486fe293..b35a898ba9 100644 --- a/docs/build.md +++ b/docs/build.md @@ -197,13 +197,12 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab The following compilation options are also available to tweak performance: -| Option | Legal values | Default | Description | -|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | -| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | -| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | -| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | -| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | +| Option | Legal values | Default | Description | +|-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). 
MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | +| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for CDNA and RDNA4) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). | +| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | +| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | ## MUSA diff --git a/docs/multimodal/MobileVLM.md b/docs/multimodal/MobileVLM.md index 4f5eca6190..3bfab9f3d2 100644 --- a/docs/multimodal/MobileVLM.md +++ b/docs/multimodal/MobileVLM.md @@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens ## Orin compile and run ### compile ```sh -make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 +make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 -j 32 ``` ### run on Orin ### case 1 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 11ff38762b..abc4fa1c89 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,6 +34,7 @@ else() add_subdirectory(gen-docs) add_subdirectory(training) add_subdirectory(diffusion) + add_subdirectory(model-conversion) if (NOT GGML_BACKEND_DL) add_subdirectory(convert-llama2c-to-ggml) # these examples use the backends directly and cannot be built with dynamic loading diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md index 7f2e2fcdcf..f08901506f 100644 --- a/examples/batched.swift/README.md +++ b/examples/batched.swift/README.md @@ -1,4 +1,5 @@ This is a swift clone of `examples/batched`. 
-$ `make` -$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]` +```bash +$ ./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL] +``` diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md index aab3cd0ca4..c82de2a5a9 100644 --- a/examples/lookahead/README.md +++ b/examples/lookahead/README.md @@ -5,3 +5,9 @@ Demonstration of lookahead decoding technique: https://lmsys.org/blog/2023-11-21-lookahead-decoding/ More info: https://github.com/ggml-org/llama.cpp/pull/4207 + +Sample command: + +```bash +llama-lookahead -hf ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF -p "// network server implemented in C\n// author: Peter Hacker\n\n#include" -e -ngl 99 -t 4 -n 512 -c 4096 -kvu +``` diff --git a/examples/model-conversion/.gitignore b/examples/model-conversion/.gitignore new file mode 100644 index 0000000000..451227547f --- /dev/null +++ b/examples/model-conversion/.gitignore @@ -0,0 +1,3 @@ +.model_name +data +ppl diff --git a/examples/model-conversion/CMakeLists.txt b/examples/model-conversion/CMakeLists.txt new file mode 100644 index 0000000000..fc1746ce45 --- /dev/null +++ b/examples/model-conversion/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-logits) +add_executable(${TARGET} logits.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/model-conversion/Makefile b/examples/model-conversion/Makefile new file mode 100644 index 0000000000..27d95b4f2b --- /dev/null +++ b/examples/model-conversion/Makefile @@ -0,0 +1,163 @@ +# Validation functions +define validate_model_path + @if [ -z "$(MODEL_PATH)" ]; then \ + echo "Error: MODEL_PATH must be provided either as:"; \ + echo " 1. Environment variable: export MODEL_PATH=/path/to/model"; \ + echo " 2. Command line argument: make $(1) MODEL_PATH=/path/to/model"; \ + exit 1; \ + fi +endef + +define validate_embedding_model_path + @if [ -z "$(EMBEDDING_MODEL_PATH)" ]; then \ + echo "Error: EMBEDDING_MODEL_PATH must be provided either as:"; \ + echo " 1. Environment variable: export EMBEDDING_MODEL_PATH=/path/to/model"; \ + echo " 2. 
Command line argument: make $(1) EMBEDDING_MODEL_PATH=/path/to/model"; \ + exit 1; \ + fi +endef + +### +### Casual Model targets/recipes +### +causal-convert-model-bf16: OUTTYPE=bf16 +causal-convert-model-bf16: causal-convert-model + +causal-convert-model: + $(call validate_model_path,causal-convert-model) + @MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \ + METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \ + ./scripts/causal/convert-model.sh + +causal-run-original-model: + $(call validate_model_path,causal-run-original-model) + @MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py + +causal-run-converted-model: + @CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh + +causal-verify-logits: causal-run-original-model causal-run-converted-model + @./scripts/causal/compare-logits.py + @MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH} + +causal-run-original-embeddings: + @./scripts/causal/run-casual-gen-embeddings-org.sh + +causal-run-converted-embeddings: + @./scripts/causal/run-converted-model-embeddings-logits.sh + +causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-embeddings + @./scripts/causal/compare-embeddings-logits.sh + +causal-inspect-original-model: + @./scripts/utils/inspect-org-model.py + +causal-inspect-converted-model: + @./scripts/utils/inspect-converted-model.sh + +causal-start-embedding-server: + @./scripts/utils/run-embedding-server.sh ${CONVERTED_MODEL} + +causal-curl-embedding-endpoint: causal-run-original-embeddings + @./scripts/utils/curl-embedding-server.sh | ./scripts/causal/compare-embeddings-logits.sh + +causal-quantize-Q8_0: QUANTIZED_TYPE = Q8_0 +causal-quantize-Q8_0: causal-quantize-model + +causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0 +causal-quantize-Q4_0: causal-quantize-model + +causal-quantize-model: + @CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE} + @echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment" + +causal-run-quantized-model: + @QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL} + + +### +### Embedding Model targets/recipes +### + +embedding-convert-model-bf16: OUTTYPE=bf16 +embedding-convert-model-bf16: embedding-convert-model + +embedding-convert-model: + $(call validate_embedding_model_path,embedding-convert-model) + @MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \ + METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \ + ./scripts/embedding/convert-model.sh + +embedding-run-original-model: + $(call validate_embedding_model_path,embedding-run-original-model) + @EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py + +embedding-run-converted-model: + @CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL} + +embedding-verify-logits: embedding-run-original-model embedding-run-converted-model + @./scripts/embedding/compare-embeddings-logits.sh + +embedding-inspect-original-model: + $(call validate_embedding_model_path,embedding-inspect-original-model) + @EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} + +embedding-inspect-converted-model: + @CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL} + 
+embedding-start-embedding-server:
+    @./scripts/utils/run-embedding-server.sh ${CONVERTED_EMBEDDING_MODEL}
+
+embedding-curl-embedding-endpoint:
+    @./scripts/utils/curl-embedding-server.sh | ./scripts/embedding/compare-embeddings-logits.sh
+
+embedding-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
+embedding-quantize-Q8_0: embedding-quantize-model
+
+embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-Q4_0: embedding-quantize-model
+
+embedding-quantize-model:
+    @./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
+    @echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+
+embedding-run-quantized-model:
+    @./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
+
+###
+### Perplexity targets/recipes
+###
+perplexity-data-gen:
+    CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/utils/perplexity-gen.sh
+
+perplexity-run-full:
+    QUANTIZED_MODEL="$(QUANTIZED_MODEL)" LOOGITS_FILE="$(LOGITS_FILE)" \
+    ./scripts/utils/perplexity-run.sh
+
+perplexity-run:
+    QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/utils/perplexity-run-simple.sh
+
+###
+### HuggingFace targets/recipes
+###
+
+hf-create-model:
+    @./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
+
+hf-create-model-private:
+    @./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
+
+hf-upload-gguf-to-model:
+    @./scripts/utils/hf-upload-gguf-model.py -m "${MODEL_PATH}" -r "${REPO_ID}" -o "${NAME_IN_REPO}"
+
+hf-create-collection:
+    @./scripts/utils/hf-create-collection.py -n "${NAME}" -d "${DESCRIPTION}" -ns "${NAMESPACE}"
+
+hf-add-model-to-collection:
+    @./scripts/utils/hf-add-model-to-collection.py -c "${COLLECTION}" -m "${MODEL}"
+
+
+.PHONY: clean
+clean:
+    @${RM} -rf data .converted_embedding_model.txt .converted_model.txt .embedding_model_name.txt .model_name.txt
+
diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md
new file mode 100644
index 0000000000..c924a6be3c
--- /dev/null
+++ b/examples/model-conversion/README.md
@@ -0,0 +1,335 @@
+# Model Conversion Example
+This directory contains scripts and code to help in the process of converting
+HuggingFace PyTorch models to GGUF format.
+
+The motivation for having this is that the conversion process can often be an
+iterative one, where the original model is inspected, converted, updates
+made to llama.cpp, converted again, etc. Once the model has been converted it
+needs to be verified against the original model, then optionally quantized,
+and in some cases the perplexity of the quantized model checked. And finally the
+model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example
+tries to help with this process.
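+
+As a rough illustration only (paths, the converted model location, and the repository
+name below are placeholders), an end-to-end session using the targets defined in the
+Makefile might look like this, assuming the model has already been downloaded and
+`MODEL_PATH` points to it; each step is described in detail in the sections below:
+```console
+(venv) $ make causal-convert-model                       # convert the original model to GGUF
+(venv) $ export CONVERTED_MODEL=/path/to/converted/model.gguf
+(venv) $ make causal-verify-logits                       # compare original vs converted logits
+(venv) $ make causal-quantize-Q8_0                       # quantize the converted model
+(venv) $ export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
+(venv) $ make perplexity-run QUANTIZED_MODEL=${QUANTIZED_MODEL}
+(venv) $ make hf-upload-gguf-to-model MODEL_PATH=${QUANTIZED_MODEL} REPO_ID=ggml-org/SomeModel-GGUF
+```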
+
+### Overview
+The idea is that the makefile targets and scripts here can be used in the
+development/conversion process assisting with things like:
+
+* inspect/run the original model to figure out how it works
+* convert the original model to GGUF format
+* inspect/run the converted model
+* verify the logits produced by the original model and the converted model
+* quantize the model to GGUF format
+* run perplexity evaluation to verify that the quantized model is performing
+  as expected
+* upload the model to HuggingFace to make it available for others
+
+## Setup
+Create a virtual Python environment:
+```console
+$ python3.11 -m venv venv
+$ source venv/bin/activate
+(venv) $ pip install -r requirements.txt
+```
+
+## Causal Language Model Conversion
+This section describes the steps to convert a causal language model to GGUF and
+to verify that the conversion was successful.
+
+### Download the original model
+First, clone the original model to some local directory:
+```console
+$ mkdir models && cd models
+$ git clone https://huggingface.co/user/model_name
+$ cd model_name
+$ git lfs install
+$ git lfs pull
+```
+
+### Set the MODEL_PATH
+The path to the downloaded model can be provided in two ways:
+
+**Option 1: Environment variable (recommended for iterative development)**
+```console
+export MODEL_PATH=~/work/ai/models/some_model
+```
+
+**Option 2: Command line argument (for one-off tasks)**
+```console
+make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
+```
+
+Command line arguments take precedence over environment variables when both are provided.
+
+In cases where the transformer implementation for the model has not been released
+yet, it is possible to set the environment variable `UNRELEASED_MODEL_NAME`, which
+will cause the transformer implementation to be loaded explicitly instead of
+using AutoModelForCausalLM:
+```
+export UNRELEASED_MODEL_NAME=SomeNewModel
+```
+
+### Inspecting the original tensors
+```console
+# Using environment variable
+(venv) $ make causal-inspect-original-model
+
+# Or using command line argument
+(venv) $ make causal-inspect-original-model MODEL_PATH=~/work/ai/models/some_model
+```
+
+### Running the original model
+This is mainly to verify that the original model works, and to compare its output
+with that of the converted model.
+```console
+# Using environment variable
+(venv) $ make causal-run-original-model
+
+# Or using command line argument
+(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
+```
+This command will save two files to the `data` directory: a binary file
+containing logits, which will be used for comparison with the converted model
+later, and a text file which allows for manual visual inspection.
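+
+If you want to take a quick look at the saved logits yourself, outside of the provided
+comparison scripts, the binary file is just a flat array of `float32` values. Below is
+a minimal NumPy sketch; the file name is illustrative and follows the
+`pytorch-<model_name>.bin` naming used by the comparison scripts:
+```python
+import numpy as np
+
+# written by causal-run-original-model into the data directory
+logits = np.fromfile("data/pytorch-some_model.bin", dtype=np.float32)
+
+print("number of logits:", logits.shape[0])        # typically the vocabulary size
+print("first 10 logits :", logits[:10])
+print("argmax token id :", int(np.argmax(logits)))
+```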
+
+### Model conversion
+After updates have been made to [gguf-py](../../gguf-py) to add support for the
+new model, the model can be converted to GGUF format using the following command:
+```console
+# Using environment variable
+(venv) $ make causal-convert-model
+
+# Or using command line argument
+(venv) $ make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
+```
+
+### Inspecting the converted model
+The converted model can be inspected using the following command:
+```console
+(venv) $ make causal-inspect-converted-model
+```
+
+### Running the converted model
+```console
+(venv) $ make causal-run-converted-model
+```
+
+### Model logits verification
+The following target will run the original model and the converted model and
+compare the logits:
+```console
+(venv) $ make causal-verify-logits
+```
+
+### Quantizing the model
+The causal model can be quantized to GGUF format using the following command:
+```console
+(venv) $ make causal-quantize-Q8_0
+Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
+Export the quantized model path to QUANTIZED_MODEL variable in your environment
+```
+This will show the path to the quantized model in the terminal, which can then
+be used to set the `QUANTIZED_MODEL` environment variable:
+```console
+export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
+```
+Then the quantized model can be run using the following command:
+```console
+(venv) $ make causal-run-quantized-model
+```
+
+
+## Embedding Language Model Conversion
+
+### Download the original model
+```console
+$ mkdir models && cd models
+$ git clone https://huggingface.co/user/model_name
+$ cd model_name
+$ git lfs install
+$ git lfs pull
+```
+
+The path to the embedding model can be provided in two ways:
+
+**Option 1: Environment variable (recommended for iterative development)**
+```console
+export EMBEDDING_MODEL_PATH=~/path/to/embedding_model
+```
+
+**Option 2: Command line argument (for one-off tasks)**
+```console
+make embedding-convert-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
+```
+
+Command line arguments take precedence over environment variables when both are provided.
+
+### Running the original model
+This is mainly to verify that the original model works and to compare the output
+with the output from the converted model.
+```console
+# Using environment variable
+(venv) $ make embedding-run-original-model
+
+# Or using command line argument
+(venv) $ make embedding-run-original-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
+```
+This command will save two files to the `data` directory: a binary
+file containing logits, which will be used for comparison with the converted
+model, and a text file which allows for manual visual inspection.
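+
+As with the causal model, the saved binaries are flat `float32` arrays, so a quick
+ad-hoc comparison can be done with NumPy. This is only a sketch and is not what the
+`embedding-verify-logits` target does; the file names below assume the
+`pytorch-<name>-embeddings.bin` / `llamacpp-<name>-embeddings.bin` naming used by the
+scripts, so adjust them if your setup writes different names:
+```python
+import numpy as np
+
+a = np.fromfile("data/pytorch-model_name-embeddings.bin", dtype=np.float32)
+b = np.fromfile("data/llamacpp-model_name-embeddings.bin", dtype=np.float32)
+assert a.shape == b.shape, "embedding dumps have different sizes"
+
+# cosine similarity over the flattened embeddings; values close to 1.0 are expected
+cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+print(f"cosine similarity: {cos:.6f}")
+```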
+
+### Model conversion
+After updates have been made to [gguf-py](../../gguf-py) to add support for the
+new model, the model can be converted to GGUF format using the following command:
+```console
+(venv) $ make embedding-convert-model
+```
+
+### Run the converted model
+```console
+(venv) $ make embedding-run-converted-model
+```
+
+### Model logits verification
+The following target will run the original model and the converted model (which
+was done manually in the previous steps) and compare the logits:
+```console
+(venv) $ make embedding-verify-logits
+```
+
+### llama-server verification
+To verify that the converted model works with llama-server, the following
+command can be used:
+```console
+(venv) $ make embedding-start-embedding-server
+```
+Then open another terminal and set the `EMBEDDING_MODEL_PATH` environment
+variable as this will not be inherited by the new terminal:
+```console
+(venv) $ make embedding-curl-embedding-endpoint
+```
+This will call the `embedding` endpoint and the output will be piped into
+the same verification script as used by the target `embedding-verify-logits`.
+
+The causal model can also be used to produce embeddings and this can be verified
+using the following commands:
+```console
+(venv) $ make causal-start-embedding-server
+```
+Then open another terminal and set the `MODEL_PATH` environment
+variable as this will not be inherited by the new terminal:
+```console
+(venv) $ make causal-curl-embedding-endpoint
+```
+
+### Quantizing the model
+The embedding model can be quantized to GGUF format using the following command:
+```console
+(venv) $ make embedding-quantize-Q8_0
+Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
+Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
+```
+This will show the path to the quantized model in the terminal, which can then
+be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
+```console
+export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
+```
+Then the quantized model can be run using the following command:
+```console
+(venv) $ make embedding-run-quantized-model
+```
+
+## Perplexity Evaluation
+
+### Simple perplexity evaluation
+This allows running the perplexity evaluation without having to generate a
+token/logits file:
+```console
+(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
+```
+This will use the wikitext dataset to run the perplexity evaluation and
+output the perplexity score to the terminal. This value can then be compared
+with the perplexity score of the unquantized model.
+
+### Full perplexity evaluation
+First use the converted, non-quantized model to generate the perplexity evaluation
+dataset using the following command:
+```console
+$ make perplexity-data-gen CONVERTED_MODEL=~/path/to/converted/model.gguf
+```
+This will generate a file in the `data` directory named after the model and with
+a `.kld` suffix which contains the tokens and the logits for the wikitext dataset.
+
+After the dataset has been generated, the perplexity evaluation can be run using
+the quantized model:
+```console
+$ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LOGITS_FILE=data/model.gguf.ppl
+```
+
+> 📝 **Note:** The `LOGITS_FILE` generated by the previous command
+> can be very large, so make sure you have enough disk space available.
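+
+As a quick sanity check before running the full evaluation, you can look at the size of
+the generated dataset file and at the available disk space; the paths below are
+illustrative and depend on the suffix used by your setup:
+```console
+$ du -h data/*.ppl data/*.kld 2>/dev/null
+$ df -h .
+```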
+
+## HuggingFace utilities
+The following targets are useful for creating collections and model repositories
+on Hugging Face in the ggml-org. These can be used when preparing a release
+to script the process for new model releases.
+
+For the following targets a `HF_TOKEN` environment variable is required.
+
+> 📝 **Note:** Don't forget to log out from Hugging Face after running these
+> commands, otherwise you might have issues pulling/cloning repositories as
+> the token will still be in use:
+> $ huggingface-cli logout
+> $ unset HF_TOKEN
+
+### Create a new Hugging Face Model (model repository)
+This will create a new model repository on Hugging Face with the specified
+model name.
+```console
+(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev"
+Repository ID: danbev/TestModel-GGUF
+Repository created: https://huggingface.co/danbev/TestModel-GGUF
+```
+Note that we append a `-GGUF` suffix to the model name to ensure a consistent
+naming convention for GGUF models.
+
+### Upload a GGUF model to a model repository
+The following target uploads a model to an existing Hugging Face model repository.
+```console
+(venv) $ make hf-upload-gguf-to-model MODEL_PATH=dummy-model1.gguf REPO_ID=danbev/TestModel-GGUF
+📤 Uploading dummy-model1.gguf to danbev/TestModel-GGUF/dummy-model1.gguf
+✅ Upload successful!
+🔗 File available at: https://huggingface.co/danbev/TestModel-GGUF/blob/main/dummy-model1.gguf
+```
+This command can also be used to update an existing model file in a repository.
+
+### Create a new Collection
+```console
+(venv) $ make hf-create-collection NAME=TestCollection DESCRIPTION="Collection for testing scripts" NAMESPACE=danbev
+🚀 Creating Hugging Face Collection
+Title: TestCollection
+Description: Collection for testing scripts
+Namespace: danbev
+Private: False
+✅ Authenticated as: danbev
+📚 Creating collection: 'TestCollection'...
+✅ Collection created successfully!
+📋 Collection slug: danbev/testcollection-68930fcf73eb3fc200b9956d
+🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
+
+🎉 Collection created successfully!
+Use this slug to add models: danbev/testcollection-68930fcf73eb3fc200b9956d
+```
+
+### Add model to a Collection
+```console
+(venv) $ make hf-add-model-to-collection COLLECTION=danbev/testcollection-68930fcf73eb3fc200b9956d MODEL=danbev/TestModel-GGUF
+✅ Authenticated as: danbev
+🔍 Checking if model exists: danbev/TestModel-GGUF
+✅ Model found: danbev/TestModel-GGUF
+📚 Adding model to collection...
+✅ Model added to collection successfully!
+🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
+
+🎉 Model added successfully!
+ +``` diff --git a/examples/model-conversion/logits.cpp b/examples/model-conversion/logits.cpp new file mode 100644 index 0000000000..2cac6a3b3e --- /dev/null +++ b/examples/model-conversion/logits.cpp @@ -0,0 +1,209 @@ +#include "llama.h" +#include +#include +#include +#include +#include +#include + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n", argv[0]); + printf("\n"); +} + +int main(int argc, char ** argv) { + std::string model_path; + std::string prompt = "Hello, my name is"; + int ngl = 0; + bool embedding_mode = false; + + { + int i = 1; + for (; i < argc; i++) { + if (strcmp(argv[i], "-m") == 0) { + if (i + 1 < argc) { + model_path = argv[++i]; + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-ngl") == 0) { + if (i + 1 < argc) { + try { + ngl = std::stoi(argv[++i]); + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-embd-mode") == 0) { + if (i + 1 < argc) { + try { + embedding_mode = true; + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else { + // prompt starts here + break; + } + } + + if (model_path.empty()) { + print_usage(argc, argv); + return 1; + } + + if (i < argc) { + prompt = argv[i++]; + for (; i < argc; i++) { + prompt += " "; + prompt += argv[i]; + } + } + } + + ggml_backend_load_all(); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; + + llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + // Extract basename from model_path + const char * basename = strrchr(model_path.c_str(), '/'); + basename = (basename == NULL) ? 
model_path.c_str() : basename + 1; + + char model_name[256]; + strncpy(model_name, basename, 255); + model_name[255] = '\0'; + + char * dot = strrchr(model_name, '.'); + if (dot != NULL && strcmp(dot, ".gguf") == 0) { + *dot = '\0'; + } + printf("Model name: %s\n", model_name); + + const llama_vocab * vocab = llama_model_get_vocab(model); + const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); + + std::vector prompt_tokens(n_prompt); + if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { + fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__); + return 1; + } + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = n_prompt; + ctx_params.n_batch = n_prompt; + ctx_params.no_perf = false; + if (embedding_mode) { + ctx_params.embeddings = true; + ctx_params.n_ubatch = ctx_params.n_batch; + } + + llama_context * ctx = llama_init_from_model(model, ctx_params); + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + printf("Input prompt: \"%s\"\n", prompt.c_str()); + printf("Tokenized prompt (%d tokens): ", n_prompt); + for (auto id : prompt_tokens) { + char buf[128]; + int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true); + if (n < 0) { + fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); + return 1; + } + std::string s(buf, n); + printf("%s", s.c_str()); + } + printf("\n"); + + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + float * logits; + int n_logits; + const char * type; + + if (embedding_mode) { + logits = llama_get_embeddings(ctx); + n_logits = llama_model_n_embd(model) * batch.n_tokens; + type = "-embeddings"; + printf("Embeddings size: %d\n", n_logits); + } else { + logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); + n_logits = llama_vocab_n_tokens(vocab); + type = ""; + printf("Vocab size: %d\n", n_logits); + } + + std::filesystem::create_directory("data"); + + // Save logits to binary file + char bin_filename[512]; + snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type); + printf("Saving logits to %s\n", bin_filename); + + FILE * f = fopen(bin_filename, "wb"); + if (f == NULL) { + fprintf(stderr, "%s: error: failed to open binary output file\n", __func__); + return 1; + } + fwrite(logits, sizeof(float), n_logits, f); + fclose(f); + + // Also save as text for debugging + char txt_filename[512]; + snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type); + f = fopen(txt_filename, "w"); + if (f == NULL) { + fprintf(stderr, "%s: error: failed to open text output file\n", __func__); + return 1; + } + for (int i = 0; i < n_logits; i++) { + fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format + } + fclose(f); + + // Print first and last 10 logits for quick verification + printf("First 10 logits: "); + for (int i = 0; i < 10 && i < n_logits; i++) { + printf("%.6f ", logits[i]); + } + printf("\n"); + + printf("Last 10 logits: "); + for (int i = n_logits - 10; i < n_logits; i++) { + if (i >= 0) printf("%.6f ", logits[i]); + } + printf("\n\n"); + + printf("Logits saved to %s\n", bin_filename); + printf("Logits saved to %s\n", txt_filename); + + llama_free(ctx); + llama_model_free(model); + + 
return 0; +} diff --git a/examples/model-conversion/requirements.txt b/examples/model-conversion/requirements.txt new file mode 100644 index 0000000000..b8148b269a --- /dev/null +++ b/examples/model-conversion/requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch~=2.6.0 +torchvision~=0.21.0 +transformers~=4.55.0 +huggingface-hub~=0.34.0 diff --git a/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh b/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh new file mode 100755 index 0000000000..287158f638 --- /dev/null +++ b/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh @@ -0,0 +1,43 @@ +#/bin/bash + +set -e + +MODEL_PATH="${1:-"$MODEL_PATH"}" +MODEL_NAME="${2:-$(basename "$MODEL_PATH")}" + +if [ -t 0 ]; then + CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin" +else + # Process piped JSON data and convert to binary (matching logits.cpp format) + TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) + python3 -c " +import json +import sys +import struct + +data = json.load(sys.stdin) + +# Flatten all embeddings completely +flattened = [] +for item in data: + embedding = item['embedding'] + for token_embedding in embedding: + flattened.extend(token_embedding) + +print(f'Total embedding values: {len(flattened)}', file=sys.stderr) + +# Write as binary floats - matches logitc.cpp fwrite format +with open('$TEMP_FILE', 'wb') as f: + for value in flattened: + f.write(struct.pack('f', value)) +" + CPP_EMBEDDINGS="$TEMP_FILE" + trap "rm -f $TEMP_FILE" EXIT +fi + +python scripts/utils/semantic_check.py --model-path $MODEL_PATH \ + --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \ + --cpp-embeddings $CPP_EMBEDDINGS \ + --prompt "Hello world today" \ + --causal + diff --git a/examples/model-conversion/scripts/causal/compare-logits.py b/examples/model-conversion/scripts/causal/compare-logits.py new file mode 100755 index 0000000000..fb959f0d56 --- /dev/null +++ b/examples/model-conversion/scripts/causal/compare-logits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +import numpy as np +import sys +import os +from pathlib import Path + +def quick_logits_check(pytorch_file, llamacpp_file): + """Lightweight sanity check before NMSE""" + + try: + pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32) + llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32) + except Exception as e: + print(f"❌ NOK: Failed to load files - {e}") + return False + + # Check shapes match + if pytorch_logits.shape != llamacpp_logits.shape: + print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}") + return False + + # Calculate key metrics + diff = pytorch_logits - llamacpp_logits + abs_diff = np.abs(diff) + max_diff = np.max(abs_diff) + + # Get top 10 predictions from both models + pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1] + llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1] + print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}") + print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}") + print(f"Max absolute difference: {max_diff:.4f}") + + if max_diff > 1.0: + print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}") + return False + + return True + +def main(): + model_path = os.getenv('MODEL_PATH') + if not model_path: + print("Error: MODEL_PATH environment variable not set") + sys.exit(1) + + if not os.path.exists(model_path): + print(f"Error: Model file not found: {model_path}") + sys.exit(1) + + 
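+    # Both dumps are raw float32 files under data/: run-org-model.py writes
+    # pytorch-<model>.bin (reference) and the llama-logits tool writes
+    # llamacpp-<model>.bin (converted model under test).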
model_name = os.path.splitext(os.path.basename(model_path))[0] + data_dir = Path("data") + + pytorch_file = data_dir / f"pytorch-{model_name}.bin" + llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + if not pytorch_file.exists(): + print(f"Error: PyTorch logits file not found: {pytorch_file}") + print("Please run scripts/run-org-model.sh first to generate this file.") + sys.exit(1) + + if not llamacpp_file.exists(): + print(f"Error: llama.cpp logits file not found: {llamacpp_file}") + print("Please run scripts/run-converted-model.sh first to generate this file.") + sys.exit(1) + + print("Checked all required files were found. Proceeding...\n") + + + print("🔍 GGML Model Validation for model ", model_name) + print("=" * 40) + print(f"PyTorch logits : {pytorch_file}") + print(f"llama.cpp logits: {llamacpp_file}") + print() + + success = quick_logits_check(pytorch_file, llamacpp_file) + + # Exit with appropriate code + if success: + print("✅ OK: Lightweight model check successful!") + print(" Ok to proceed with NMSE check...") + sys.exit(0) + else: + print(f"❌ NOK: Top 10 predictions don't match - generation will differ") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/examples/model-conversion/scripts/causal/convert-model.sh b/examples/model-conversion/scripts/causal/convert-model.sh new file mode 100755 index 0000000000..56b21f9baa --- /dev/null +++ b/examples/model-conversion/scripts/causal/convert-model.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" +OUTPUT_DIR="${OUTPUT_DIR:-../../models}" +TYPE="${OUTTYPE:-f16}" +METADATA_OVERRIDE="${METADATA_OVERRIDE:-}" +CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" + +echo "Model path: ${MODEL_PATH}" +echo "Model name: ${MODEL_NAME}" +echo "Data type: ${TYPE}" +echo "Converted model path:: ${CONVERTED_MODEL}" +echo "Metadata override: ${METADATA_OVERRIDE}" +python ../../convert_hf_to_gguf.py --verbose \ + ${MODEL_PATH} \ + --outfile ${CONVERTED_MODEL} \ + --outtype ${TYPE} \ + --metadata "${METADATA_OVERRIDE}" + +echo "" +echo "The environment variable CONVERTED_MODEL can be set to this path using:" +echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})" diff --git a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.sh b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.sh new file mode 100755 index 0000000000..2fb54ab990 --- /dev/null +++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +import argparse +import os +import importlib +import sys +import torch +import numpy as np + +from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM +from pathlib import Path + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + +config = AutoConfig.from_pretrained(model_path) + +print("Model type: ", config.model_type) +print("Vocab size: ", config.vocab_size) +print("Hidden size: ", config.hidden_size) +print("Number of layers: ", config.num_hidden_layers) +print("BOS token id: ", config.bos_token_id) +print("EOS token id: ", config.eos_token_id) + 
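+# When UNRELEASED_MODEL_NAME is set, the model class is imported dynamically from
+# transformers.models.<name>.modular_<name> instead of relying on AutoModelForCausalLM.
+# This allows verifying architectures whose transformers support has not been released yet.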
+print("Loading model and tokenizer using AutoTokenizer:", model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path) + +if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + class_name = f"{unreleased_model_name}ForCausalLM" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(model_path) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") +else: + model = AutoModelForCausalLM.from_pretrained(model_path) +print(f"Model class: {type(model)}") +#print(f"Model file: {type(model).__module__}") + +model_name = os.path.basename(model_path) +print(f"Model name: {model_name}") + +prompt = "Hello world today" +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +print(f"Input tokens: {input_ids}") +print(f"Input text: {repr(prompt)}") +print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") + +with torch.no_grad(): + outputs = model(input_ids, output_hidden_states=True) + + # Extract hidden states from the last layer + # outputs.hidden_states is a tuple of (num_layers + 1) tensors + # Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size] + last_hidden_states = outputs.hidden_states[-1] + + # Get embeddings for all tokens + token_embeddings = last_hidden_states[0].cpu().numpy() # Remove batch dimension + + print(f"Hidden states shape: {last_hidden_states.shape}") + print(f"Token embeddings shape: {token_embeddings.shape}") + print(f"Hidden dimension: {token_embeddings.shape[-1]}") + print(f"Number of tokens: {token_embeddings.shape[0]}") + + # Save raw token embeddings + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin" + txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt" + + # Save all token embeddings as binary + print(token_embeddings) + token_embeddings.astype(np.float32).tofile(bin_filename) + + # Save as text for inspection + with open(txt_filename, "w") as f: + for i, embedding in enumerate(token_embeddings): + for j, val in enumerate(embedding): + f.write(f"{i} {j} {val:.6f}\n") + + # Print embeddings per token in the requested format + print("\nToken embeddings:") + tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) + for i, embedding in enumerate(token_embeddings): + # Format: show first few values, ..., then last few values + if len(embedding) > 10: + # Show first 3 and last 3 values with ... in between + first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3]) + last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:]) + print(f"embedding {i}: {first_vals} ... 
{last_vals}") + else: + # If embedding is short, show all values + vals = " ".join(f"{val:8.6f}" for val in embedding) + print(f"embedding {i}: {vals}") + + # Also show token info for reference + print(f"\nToken reference:") + for i, token in enumerate(tokens): + print(f" Token {i}: {repr(token)}") + + print(f"Saved bin logits to: {bin_filename}") + print(f"Saved txt logist to: {txt_filename}") diff --git a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh new file mode 100755 index 0000000000..64709f1798 --- /dev/null +++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +cmake --build ../../build --target llama-logits -j8 + +../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today" diff --git a/examples/model-conversion/scripts/causal/run-converted-model.sh b/examples/model-conversion/scripts/causal/run-converted-model.sh new file mode 100755 index 0000000000..e2762729e7 --- /dev/null +++ b/examples/model-conversion/scripts/causal/run-converted-model.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. 
CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL + +cmake --build ../../build --target llama-logits -j8 + +../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is" diff --git a/examples/model-conversion/scripts/causal/run-org-model.py b/examples/model-conversion/scripts/causal/run-org-model.py new file mode 100755 index 0000000000..f6188ea6f3 --- /dev/null +++ b/examples/model-conversion/scripts/causal/run-org-model.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +import argparse +import os +import importlib +from pathlib import Path + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +import torch +import numpy as np + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + +config = AutoConfig.from_pretrained(model_path) + +print("Model type: ", config.model_type) +print("Vocab size: ", config.vocab_size) +print("Hidden size: ", config.hidden_size) +print("Number of layers: ", config.num_hidden_layers) +print("BOS token id: ", config.bos_token_id) +print("EOS token id: ", config.eos_token_id) + +print("Loading model and tokenizer using AutoTokenizer:", model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path) +config = AutoConfig.from_pretrained(model_path) + +if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + class_name = f"{unreleased_model_name}ForCausalLM" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) +else: + model = AutoModelForCausalLM.from_pretrained(model_path) + +model_name = os.path.basename(model_path) +# Printing the Model class to allow for easier debugging. This can be useful +# when working with models that have not been publicly released yet and this +# migth require that the concrete class is imported and used directly instead +# of using AutoModelForCausalLM. 
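+# The last-token logits computed below are written to data/pytorch-<model>.bin
+# (raw float32) plus a human-readable data/pytorch-<model>.txt; compare-logits.py
+# and check-nmse.py read the .bin file as the reference for the converted model.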
+print(f"Model class: {model.__class__.__name__}") + +prompt = "Hello, my name is" +input_ids = tokenizer(prompt, return_tensors="pt").input_ids + +print(f"Input tokens: {input_ids}") +print(f"Input text: {repr(prompt)}") +print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") + +with torch.no_grad(): + outputs = model(input_ids) + logits = outputs.logits + + # Extract logits for the last token (next token prediction) + last_logits = logits[0, -1, :].cpu().numpy() + + print(f"Logits shape: {logits.shape}") + print(f"Last token logits shape: {last_logits.shape}") + print(f"Vocab size: {len(last_logits)}") + + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + bin_filename = data_dir / f"pytorch-{model_name}.bin" + txt_filename = data_dir / f"pytorch-{model_name}.txt" + + # Save to file for comparison + last_logits.astype(np.float32).tofile(bin_filename) + + # Also save as text file for easy inspection + with open(txt_filename, "w") as f: + for i, logit in enumerate(last_logits): + f.write(f"{i}: {logit:.6f}\n") + + # Print some sample logits for quick verification + print(f"First 10 logits: {last_logits[:10]}") + print(f"Last 10 logits: {last_logits[-10:]}") + + # Show top 5 predicted tokens + top_indices = np.argsort(last_logits)[-5:][::-1] + print("Top 5 predictions:") + for idx in top_indices: + token = tokenizer.decode([idx]) + print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}") + + print(f"Saved bin logits to: {bin_filename}") + print(f"Saved txt logist to: {txt_filename}") diff --git a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh new file mode 100755 index 0000000000..35b5d71984 --- /dev/null +++ b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh @@ -0,0 +1,42 @@ +#/bin/bash + +set -e + +MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}" +MODEL_NAME="${2:-$(basename "$MODEL_PATH")}" + +if [ -t 0 ]; then + CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin" +else + # Process piped JSON data and convert to binary (matching logits.cpp format) + TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) + python3 -c " +import json +import sys +import struct + +data = json.load(sys.stdin) + +# Flatten all embeddings completely +flattened = [] +for item in data: + embedding = item['embedding'] + for token_embedding in embedding: + flattened.extend(token_embedding) + +print(f'Total embedding values: {len(flattened)}', file=sys.stderr) + +# Write as binary floats - matches logitc.cpp fwrite format +with open('$TEMP_FILE', 'wb') as f: + for value in flattened: + f.write(struct.pack('f', value)) +" + CPP_EMBEDDINGS="$TEMP_FILE" + trap "rm -f $TEMP_FILE" EXIT +fi + +python scripts/utils/semantic_check.py --model-path $MODEL_PATH \ + --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \ + --cpp-embeddings $CPP_EMBEDDINGS \ + --prompt "Hello world today" + diff --git a/examples/model-conversion/scripts/embedding/convert-model.sh b/examples/model-conversion/scripts/embedding/convert-model.sh new file mode 100755 index 0000000000..0609e35357 --- /dev/null +++ b/examples/model-conversion/scripts/embedding/convert-model.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e + +MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}" +OUTPUT_DIR="${OUTPUT_DIR:-../../models}" +TYPE="${OUTTYPE:-f16}" +METADATA_OVERRIDE="${METADATA_OVERRIDE:-}" +CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" + +echo "Model path: ${EMBEDDING_MODEL_PATH}" +echo "Model name: 
${MODEL_NAME}" +echo "Data type: ${TYPE}" +echo "Converted model path:: ${CONVERTED_MODEL}" +python ../../convert_hf_to_gguf.py --verbose \ + ${EMBEDDING_MODEL_PATH} \ + --outfile ${CONVERTED_MODEL} \ + --outtype ${TYPE} + +echo "" +echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:" +echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})" diff --git a/examples/model-conversion/scripts/embedding/run-converted-model.sh b/examples/model-conversion/scripts/embedding/run-converted-model.sh new file mode 100755 index 0000000000..5896090411 --- /dev/null +++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL + +cmake --build ../../build --target llama-logits -j8 + +../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today" diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py new file mode 100755 index 0000000000..b9db0b893f --- /dev/null +++ b/examples/model-conversion/scripts/embedding/run-original-model.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 + +import argparse +import os +import numpy as np +import importlib +from pathlib import Path + +from transformers import AutoTokenizer, AutoConfig, AutoModel +import torch + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable") + +tokenizer = AutoTokenizer.from_pretrained(model_path) + +if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + class_name = f"{unreleased_model_name}Model" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) +else: + model = AutoModel.from_pretrained(model_path) +print(f"Model class: {type(model)}") +#print(f"Model file: {type(model).__module__}") +config = AutoConfig.from_pretrained(model_path) + +model_name = os.path.basename(model_path) + +texts = [ "Hello world today" ] + +encoded = tokenizer( + texts, + padding=True, + truncation=True, + return_tensors="pt" +) + +tokens = encoded['input_ids'][0] +token_strings = tokenizer.convert_ids_to_tokens(tokens) +for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)): + print(f"{token_id:6d} -> '{token_str}'") + +with torch.no_grad(): + outputs = model(**encoded) + hidden_states = outputs.last_hidden_state # Shape: 
[batch_size, seq_len, hidden_size] + + # Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior) + all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size] + + print(f"Hidden states shape: {hidden_states.shape}") + print(f"All embeddings shape: {all_embeddings.shape}") + print(f"Embedding dimension: {all_embeddings.shape[1]}") + + # Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE + n_embd = all_embeddings.shape[1] + n_embd_count = all_embeddings.shape[0] + + print() # Empty line to match C++ output + + for j in range(n_embd_count): + embedding = all_embeddings[j] + print(f"embedding {j}: ", end="") + + # Print first 3 values + for i in range(min(3, n_embd)): + print(f"{embedding[i]:9.6f} ", end="") + + print(" ... ", end="") + + # Print last 3 values + for i in range(n_embd - 3, n_embd): + print(f"{embedding[i]:9.6f} ", end="") + + print() # New line + + print() # Final empty line to match C++ output + + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin" + txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt" + + # Save all embeddings flattened (matching what embedding.cpp would save if it did) + flattened_embeddings = all_embeddings.flatten() + flattened_embeddings.astype(np.float32).tofile(bin_filename) + + with open(txt_filename, "w") as f: + f.write(f"# Model class: {model_name}\n") + f.write(f"# Tokens: {token_strings}\n") + f.write(f"# Shape: {all_embeddings.shape}\n") + f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n") + + for j in range(n_embd_count): + f.write(f"# Token {j} ({token_strings[j]}):\n") + for i, value in enumerate(all_embeddings[j]): + f.write(f"{j}_{i}: {value:.6f}\n") + f.write("\n") + print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)") + print("") + print(f"Saved bin embeddings to: {bin_filename}") + print(f"Saved txt embeddings to: {txt_filename}") diff --git a/examples/model-conversion/scripts/readme.md.template b/examples/model-conversion/scripts/readme.md.template new file mode 100644 index 0000000000..87800a1b93 --- /dev/null +++ b/examples/model-conversion/scripts/readme.md.template @@ -0,0 +1,13 @@ +--- +base_model: +- {base_model} +--- +# {model_name} GGUF + +Recommended way to run this model: + +```sh +llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa +``` + +Then, access http://localhost:8080 diff --git a/examples/model-conversion/scripts/utils/check-nmse.py b/examples/model-conversion/scripts/utils/check-nmse.py new file mode 100755 index 0000000000..196a6210f0 --- /dev/null +++ b/examples/model-conversion/scripts/utils/check-nmse.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 + +import numpy as np +import sys +import os +import argparse +from pathlib import Path + +def calculate_nmse(reference, test): + mse = np.mean((test - reference) ** 2) + ref_var = np.var(reference) + if ref_var == 0: + nmse = float('inf') if mse > 0 else 0.0 + return mse, mse, ref_var + + nmse = mse / ref_var + + return nmse, mse, ref_var + +def load_logits(file_path): + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + if file_path.suffix == '.npy': + return np.load(file_path) + elif file_path.suffix == '.bin': + return np.fromfile(file_path, dtype=np.float32) + else: + # Try to load as text file + try: + # If it has index format "0: value", extract just values + data = [] + with open(file_path, 'r') as f: + for line 
in f: + if ':' in line: + # Format: "index: value" + value = float(line.split(':')[1].strip()) + else: + # Just the value + value = float(line.strip()) + data.append(value) + return np.array(data, dtype=np.float32) + except: + return np.loadtxt(file_path, dtype=np.float32) + +def interpret_nmse(nmse): + """Provide interpretation of NMSE value""" + if nmse == 0: + return "Perfect match", "🎉" + elif nmse < 1e-6: + return "Essentially identical", "✅" + elif nmse < 1e-4: + return "Excellent match", "✅" + elif nmse < 1e-3: + return "Very good match", "👍" + elif nmse < 1e-2: + return "Good match", "👍" + elif nmse < 0.1: + return "Acceptable match", "⚠️" + elif nmse < 1.0: + return "Poor match", "❌" + else: + return "Very poor match (worse than noise)", "❌" + +def main(): + parser = argparse.ArgumentParser(description='Validate model logits') + parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') + args = parser.parse_args() + + model_name = os.path.splitext(os.path.basename(args.model_path))[0] + data_dir = Path("data") + + pytorch_file = data_dir / f"pytorch-{model_name}.bin" + llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + print(f"Model name: {model_name}") + print(f"PyTorch logits file: {pytorch_file}") + print(f"llama.cpp logits file: {llamacpp_file}") + + reference_file = pytorch_file + test_file = llamacpp_file + + print("📊 NMSE Check for Model Comparison") + print("=" * 50) + print(f"Reference (ground truth): {reference_file}") + print(f"Test (to evaluate): {test_file}") + print() + + try: + print("Loading reference logits...") + reference = load_logits(reference_file) + print(f" Shape: {reference.shape}, Type: {reference.dtype}") + + print("Loading test logits...") + test = load_logits(test_file) + print(f" Shape: {test.shape}, Type: {test.dtype}") + + # Check shapes match + if reference.shape != test.shape: + print(f"\n❌ Error: Shape mismatch!") + print(f" Reference: {reference.shape}") + print(f" Test: {test.shape}") + sys.exit(1) + + print(f"\n✅ Shapes match: {reference.shape}") + + nmse, mse, ref_var = calculate_nmse(reference, test) + + # Additional metrics + max_abs_error = np.max(np.abs(test - reference)) + mean_abs_error = np.mean(np.abs(test - reference)) + + # Results + print(f"\n📈 METRICS") + print("=" * 30) + print(f"MSE (Mean Squared Error): {mse:.6e}") + print(f"Reference Variance: {ref_var:.6e}") + print(f"NMSE: {nmse:.6e}") + print(f"Max Absolute Error: {max_abs_error:.6f}") + print(f"Mean Absolute Error: {mean_abs_error:.6f}") + + # NMSE in dB (common in signal processing) + if nmse > 0: + nmse_db = 10 * np.log10(nmse) + print(f"NMSE (dB): {nmse_db:.2f} dB") + + # Interpretation + interpretation, emoji = interpret_nmse(nmse) + print(f"\n🎯 INTERPRETATION") + print("=" * 30) + print(f"{emoji} {interpretation}") + + # Detailed guidance + print(f"\n📋 GUIDANCE") + print("=" * 30) + if nmse < 1e-3: + print("✅ EXCELLENT: Your GGML conversion is working very well!") + print(" The differences are negligible for practical use.") + elif nmse < 1e-2: + print("👍 GOOD: Your GGML conversion is working well.") + print(" Small differences are likely due to precision/quantization.") + elif nmse < 0.1: + print("⚠️ ACCEPTABLE: Conversion is working but with some differences.") + print(" Check if you're using quantization (Q4, Q8, etc.)") + print(" Test generation quality to see if it's acceptable.") + else: + print("❌ PROBLEMATIC: Large differences detected.") + print(" Check your conversion process for potential issues.") + print(" 
Verify you're using the same model weights.") + + # NMSE benchmarks + print(f"\n📚 NMSE BENCHMARKS") + print("=" * 30) + print("< 1e-6: Essentially identical") + print("< 1e-4: Excellent (typical for good conversions)") + print("< 1e-3: Very good") + print("< 1e-2: Good (acceptable for most use cases)") + print("< 0.1: Acceptable (may need verification)") + print("> 1.0: Poor (worse than random)") + + # Exit code based on NMSE + if nmse < 1e-2: + print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})") + sys.exit(0) + else: + print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})") + sys.exit(1) + + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/examples/model-conversion/scripts/utils/create-collection-add-model.sh b/examples/model-conversion/scripts/utils/create-collection-add-model.sh new file mode 100644 index 0000000000..4809da6cb6 --- /dev/null +++ b/examples/model-conversion/scripts/utils/create-collection-add-model.sh @@ -0,0 +1,6 @@ + +COLLECTION_SLUG=$(python ./create_collection.py --return-slug) +echo "Created collection: $COLLECTION_SLUG" + +# Use it in the next command +python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model" diff --git a/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py b/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py new file mode 100755 index 0000000000..7e38af3c13 --- /dev/null +++ b/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import sys + +def add_model_to_collection(collection_slug, model_id, note=""): + """ + Add a model to an existing collection + + Args: + collection_slug: The slug of the collection (e.g., "username/collection-name-12345") + model_id: The model repository ID (e.g., "username/model-name") + note: Optional note about the model + + Returns: + True if successful, False if failed + """ + + # Initialize API + api = HfApi() + + try: + user_info = api.whoami() + print(f"✅ Authenticated as: {user_info['name']}") + + # Verify the model exists + print(f"🔍 Checking if model exists: {model_id}") + try: + model_info = api.model_info(model_id) + except Exception as e: + print(f"❌ Model not found or not accessible: {model_id}") + print(f"Error: {e}") + return False + + print(f"📚 Adding model to collection...") + api.add_collection_item( + collection_slug=collection_slug, + item_id=model_id, + item_type="model", + note=note + ) + + print(f"✅ Model added to collection successfully!") + print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}") + + return True + + except Exception as e: + print(f"❌ Error adding model to collection: {e}") + return False + +def main(): + # This script requires that the environment variable HF_TOKEN is set with your + # Hugging Face API token. 
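+    # Example invocation (placeholder names):
+    #   HF_TOKEN=<token> python scripts/utils/hf-add-model-to-collection.py \
+    #       --collection username/my-collection-12345 --model username/my-model-GGUF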
+ api = HfApi() + + parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection') + parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True) + parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True) + parser.add_argument('--note', '-n', help='An optional note/description', required=False) + args = parser.parse_args() + + collection = args.collection + model = args.model + note = args.note + + success = add_model_to_collection( + collection_slug=collection, + model_id=model, + note=note + ) + + if success: + print("\n🎉 Model added successfully!") + else: + print("\n❌ Failed to add model to collection") + sys.exit(1) +if __name__ == "__main__": + main() diff --git a/examples/model-conversion/scripts/utils/hf-create-collection.py b/examples/model-conversion/scripts/utils/hf-create-collection.py new file mode 100755 index 0000000000..e0fa60af1a --- /dev/null +++ b/examples/model-conversion/scripts/utils/hf-create-collection.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import os +import sys + + +def create_collection(title, description, private=False, namespace=None, return_slug=False): + """ + Create a new collection on Hugging Face + + Args: + title: Collection title + description: Collection description + private: Whether the collection should be private (default: False) + namespace: Optional namespace (defaults to your username) + + Returns: + Collection object if successful, None if failed + """ + + # Check if HF_TOKEN is available + token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") + if not token: + print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables") + print("Please set your Hugging Face token as an environment variable") + return None + + # Initialize API + api = HfApi() + + try: + # Test authentication first + user_info = api.whoami() + if not return_slug: + print(f"✅ Authenticated as: {user_info['name']}") + + # Create the collection + if not return_slug: + print(f"📚 Creating collection: '{title}'...") + collection = api.create_collection( + title=title, + description=description, + private=private, + namespace=namespace + ) + + if not return_slug: + print(f"✅ Collection created successfully!") + print(f"📋 Collection slug: {collection.slug}") + print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}") + + return collection + + except Exception as e: + print(f"❌ Error creating collection: {e}") + return None + +def main(): + # This script requires that the environment variable HF_TOKEN is set with your + # Hugging Face API token. 
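+    # Example invocation (placeholder values):
+    #   python scripts/utils/hf-create-collection.py --name "My Models" \
+    #       --description "GGUF conversions" --namespace username --return-slug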
+ api = HfApi() + + parser = argparse.ArgumentParser(description='Create a Huggingface Collection') + parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True) + parser.add_argument('--description', '-d', help='The description for the Collection', required=True) + parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True) + parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true') # Fixed + parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true') # Fixed + + args = parser.parse_args() + + name = args.name + description = args.description + private = args.private + namespace = args.namespace + return_slug = args.return_slug + + if not return_slug: + print("🚀 Creating Hugging Face Collection") + print(f"Title: {name}") + print(f"Description: {description}") + print(f"Namespace: {namespace}") + print(f"Private: {private}") + + collection = create_collection( + title=name, + description=description, + private=private, + namespace=namespace, + return_slug=return_slug + ) + + if collection: + if return_slug: + print(collection.slug) + else: + print("\n🎉 Collection created successfully!") + print(f"Use this slug to add models: {collection.slug}") + else: + print("\n❌ Failed to create collection") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/examples/model-conversion/scripts/utils/hf-create-model.py b/examples/model-conversion/scripts/utils/hf-create-model.py new file mode 100755 index 0000000000..09bb8511ef --- /dev/null +++ b/examples/model-conversion/scripts/utils/hf-create-model.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse + +# This script requires that the environment variable HF_TOKEN is set with your +# Hugging Face API token. 
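+# Example invocation (placeholder values), creates <namespace>/<model-name>-GGUF:
+#   python scripts/utils/hf-create-model.py --model-name MyModel --namespace username --org-base-model org/MyModel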
+api = HfApi() + +def load_template_and_substitute(template_path, **kwargs): + try: + with open(template_path, 'r', encoding='utf-8') as f: + template_content = f.read() + + return template_content.format(**kwargs) + except FileNotFoundError: + print(f"Template file '{template_path}' not found!") + return None + except KeyError as e: + print(f"Missing template variable: {e}") + return None + +parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository') +parser.add_argument('--model-name', '-m', help='Name for the model', required=True) +parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True) +parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="") +parser.add_argument('--no-card', action='store_true', help='Skip creating model card') +parser.add_argument('--private', '-p', action='store_true', help='Create private model') + +args = parser.parse_args() + +repo_id = f"{args.namespace}/{args.model_name}-GGUF" +print("Repository ID: ", repo_id) + +repo_url = api.create_repo( + repo_id=repo_id, + repo_type="model", + private=args.private, + exist_ok=False +) + +if not args.no_card: + template_path = "scripts/readme.md.template" + model_card_content = load_template_and_substitute( + template_path, + model_name=args.model_name, + namespace=args.namespace, + base_model=args.org_base_model, + ) + + if model_card_content: + api.upload_file( + path_or_fileobj=model_card_content.encode('utf-8'), + path_in_repo="README.md", + repo_id=repo_id + ) + print("Model card created successfully.") + else: + print("Failed to create model card.") + +print(f"Repository created: {repo_url}") + + diff --git a/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py b/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py new file mode 100755 index 0000000000..15ccb1150e --- /dev/null +++ b/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import os + +def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None): + """ + Upload a GGUF file to a Hugging Face model repository + + Args: + local_file_path: Path to your local GGUF file + repo_id: Your repository ID (e.g., "username/model-name") + filename_in_repo: Optional custom name for the file in the repo + """ + + if not os.path.exists(local_file_path): + print(f"❌ File not found: {local_file_path}") + return False + + if filename_in_repo is None: + filename_in_repo = os.path.basename(local_file_path) + + if filename_in_repo is None or filename_in_repo == "": + filename_in_repo = os.path.basename(local_file_path) + + print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}") + + api = HfApi() + + try: + api.upload_file( + path_or_fileobj=local_file_path, + path_in_repo=filename_in_repo, + repo_id=repo_id, + repo_type="model", + commit_message=f"Upload {filename_in_repo}" + ) + + print("✅ Upload successful!") + print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}") + return True + + except Exception as e: + print(f"❌ Upload failed: {e}") + return False + +# This script requires that the environment variable HF_TOKEN is set with your +# Hugging Face API token. 
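+# Example invocation (placeholder values):
+#   python scripts/utils/hf-upload-gguf-model.py --gguf-model-path models/MyModel.gguf --repo-id username/MyModel-GGUF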
+api = HfApi() + +parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository') +parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True) +parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True) +parser.add_argument('--name', '-o', help='The name in the model repository', required=False) +args = parser.parse_args() + +upload_gguf_file(args.gguf_model_path, args.repo_id, args.name) diff --git a/examples/model-conversion/scripts/utils/inspect-converted-model.sh b/examples/model-conversion/scripts/utils/inspect-converted-model.sh new file mode 100755 index 0000000000..e5b9324542 --- /dev/null +++ b/examples/model-conversion/scripts/utils/inspect-converted-model.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL diff --git a/examples/model-conversion/scripts/utils/inspect-org-model.py b/examples/model-conversion/scripts/utils/inspect-org-model.py new file mode 100755 index 0000000000..bc6f45a5fb --- /dev/null +++ b/examples/model-conversion/scripts/utils/inspect-org-model.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +from safetensors import safe_open +from collections import defaultdict + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + +# Check if there's an index file (multi-file model) +index_path = os.path.join(model_path, "model.safetensors.index.json") +single_file_path = os.path.join(model_path, "model.safetensors") + +if os.path.exists(index_path): + # Multi-file model + print("Multi-file model detected") + + with open(index_path, 'r') as f: + index_data = json.load(f) + + # Get the weight map (tensor_name -> file_name) + weight_map = index_data.get("weight_map", {}) + + # Group tensors by file for efficient processing + file_tensors = defaultdict(list) + for tensor_name, file_name in weight_map.items(): + file_tensors[file_name].append(tensor_name) + + print("Tensors in model:") + + # Process each shard file + for file_name, tensor_names in file_tensors.items(): + file_path = os.path.join(model_path, file_name) + print(f"\n--- From {file_name} ---") + + with safe_open(file_path, framework="pt") as f: + for tensor_name in sorted(tensor_names): + tensor = f.get_tensor(tensor_name) + print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}") + +elif os.path.exists(single_file_path): + # Single file model (original behavior) + print("Single-file model detected") + + with safe_open(single_file_path, framework="pt") as f: + keys = f.keys() + print("Tensors in model:") + for key in sorted(keys): + tensor = f.get_tensor(key) + print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}") + +else: + print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in 
{model_path}") + print("Available files:") + if os.path.exists(model_path): + for item in sorted(os.listdir(model_path)): + print(f" {item}") + else: + print(f" Directory {model_path} does not exist") + exit(1) diff --git a/examples/model-conversion/scripts/utils/perplexity-gen.sh b/examples/model-conversion/scripts/utils/perplexity-gen.sh new file mode 100755 index 0000000000..3db0b3fd27 --- /dev/null +++ b/examples/model-conversion/scripts/utils/perplexity-gen.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +# Check if data/wikitext-2-raw directory exists +if [ ! -d "ppl/wikitext-2-raw" ]; then + echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2 + mkdir -p ppl + pushd ppl + ./../../../scripts/get-wikitext-2.sh + popd +fi + +mkdir -p ppl +OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld" +echo "Model: $CONVERTED_MODEL" + +cmake --build ../../build --target llama-perplexity -j8 + +../.././build/bin/llama-perplexity -m $CONVERTED_MODEL \ + -f ppl/wikitext-2-raw/wiki.test.raw \ + --kl-divergence-base $OUTPUTFILE + +echo "Generated logits in $OUTPUTFILE" + diff --git a/examples/model-conversion/scripts/utils/perplexity-run-simple.sh b/examples/model-conversion/scripts/utils/perplexity-run-simple.sh new file mode 100755 index 0000000000..69b3438f59 --- /dev/null +++ b/examples/model-conversion/scripts/utils/perplexity-run-simple.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e + +QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}" + +if [ -z "$QUANTIZED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. QUANTIZED_MODEL environment variable" >&2 + exit 1 +fi + +# Check if data/wikitext-2-raw directory exists +if [ ! -d "ppl/wikitext-2-raw" ]; then + echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2 + mkdir -p ppl + pushd ppl + ./../../../scripts/get-wikitext-2.sh + popd +fi + +cmake --build ../../build --target llama-perplexity -j8 + +../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw + + diff --git a/examples/model-conversion/scripts/utils/perplexity-run.sh b/examples/model-conversion/scripts/utils/perplexity-run.sh new file mode 100755 index 0000000000..3bce7c8472 --- /dev/null +++ b/examples/model-conversion/scripts/utils/perplexity-run.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -e + +QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}" +LOGITS_FILE="${1:-"$LOGITS_FILE"}" + +if [ -z "$QUANTIZED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. QUANTIZED_MODEL environment variable" >&2 + exit 1 +fi + +if [ ! -f ${LOGITS_FILE} ]; then + echo "Error: logits file '${LOGITS_FILE} was not found" + echo "Did you run the perplexity-gen.sh script?" 
+ exit 1 +fi + +echo "Model: $QUANTIZED_MODEL" +echo "Data file: $LOGITS_FILE" + +cmake --build ../../build --target llama-perplexity -j8 + +../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL \ + --kl-divergence-base $LOGITS_FILE \ + --kl-divergence diff --git a/examples/model-conversion/scripts/utils/quantize.sh b/examples/model-conversion/scripts/utils/quantize.sh new file mode 100755 index 0000000000..bcb8775754 --- /dev/null +++ b/examples/model-conversion/scripts/utils/quantize.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e + +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}" +QUANTIZED_MODEL=$CONVERTED_MODEL + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL + +# Process the quantized model filename +if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then + # Remove .gguf suffix, add quantized type, then add .gguf back + BASE_NAME="${QUANTIZED_MODEL%.gguf}" + QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf" +else + echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2 + exit 1 +fi + + +cmake --build ../../build --target llama-quantize -j8 + +../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE + +echo "Quantized model saved to: $QUANTIZED_MODEL" diff --git a/examples/model-conversion/scripts/utils/run-embedding-server.sh b/examples/model-conversion/scripts/utils/run-embedding-server.sh new file mode 100755 index 0000000000..828fc47069 --- /dev/null +++ b/examples/model-conversion/scripts/utils/run-embedding-server.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e +# +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. 
CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL + +cmake --build ../../build --target llama-server + +../../build/bin/llama-server -m $CONVERTED_MODEL \ + --embedding \ + --pooling none diff --git a/examples/model-conversion/scripts/utils/semantic_check.py b/examples/model-conversion/scripts/utils/semantic_check.py new file mode 100644 index 0000000000..d211048097 --- /dev/null +++ b/examples/model-conversion/scripts/utils/semantic_check.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 + +import numpy as np +import argparse +import os +import importlib + +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +def cosine_similarity(a, b=None): + a = np.asarray(a) + if b is None: + b = a + else: + b = np.asarray(b) + + if a.ndim == 1: + a = a.reshape(1, -1) + if b.ndim == 1: + b = b.reshape(1, -1) + + a_norms = np.linalg.norm(a, axis=1, keepdims=True) + b_norms = np.linalg.norm(b, axis=1, keepdims=True) + + a_norms = np.where(a_norms == 0, 1e-8, a_norms) + b_norms = np.where(b_norms == 0, 1e-8, b_norms) + + a_normalized = a / a_norms + b_normalized = b / b_norms + + # Compute cosine similarity + return np.dot(a_normalized, b_normalized.T) + +def load_embeddings_from_file(filename, n_tokens, n_embd): + embeddings = np.fromfile(filename, dtype=np.float32) + return embeddings.reshape(n_tokens, n_embd) + +def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt): + np.set_printoptions(suppress=True, precision=6) + print("pytorch embeddings:"); + print(python_emb) + print("llama.cpp embeddings:"); + print(cpp_emb) + print(f"\n=== Prompt: '{prompt}' ===") + print(f"Tokens: {tokens}") + print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}") + + n_tokens = len(tokens) + + # 1. Direct embedding comparison + print(f"\n1. Raw Embedding Magnitude Comparison:") + # Check if the distance of each token embedding from the origin and compare + # if the vectors are on the same "sphere". This does not tell us about + # direction (meaning of the token embedding), just magnitude. + for i in range(n_tokens): + py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings + cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings + ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf') + print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}") + + # 2. Cosine similarity between tokens within each model + # Here we check the direction of token embeddings to see if the have the + # same meaning (similarity). This is done by calculating cosine similarity + # of a pair of token embeddings within each model. + print(f"\n2. Within-Model Token Similarities:") + print(" Python model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0] + print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}") + + print(" llama.cpp model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0] + print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}") + + # 3. Cross-model similarity (same token position) + print(f"\n3. Cross-Model Same-Token Similarities:") + for i in range(n_tokens): + sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] + print(f" Token {i} ({tokens[i]}): {sim:.4f}") + + # 4. 
Similarity matrix comparison + print(f"\n4. Similarity Matrix Differences:") + py_sim_matrix = cosine_similarity(python_emb) + cpp_sim_matrix = cosine_similarity(cpp_emb) + diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix) + + print(f" Max difference: {np.max(diff_matrix):.4f}") + print(f" Mean difference: {np.mean(diff_matrix):.4f}") + print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}") + + return { + 'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)], + 'similarity_matrix_diff': diff_matrix, + 'max_diff': np.max(diff_matrix), + 'mean_diff': np.mean(diff_matrix), + 'rms_diff': np.sqrt(np.mean(diff_matrix**2)) + } + +def main(): + parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings') + parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model') + parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file') + parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file') + parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true') + parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt') + + args = parser.parse_args() + + print("Semantic Similarity Test Between Python and llama.cpp Embedding Models") + print("=" * 70) + + # Single prompt detailed comparison + print(f"\nTesting with prompt: '{args.prompt}'") + + # Load the python model to get configuration information and also to load the tokenizer. + print("Loading model and tokenizer using AutoTokenizer:", args.model_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + config = AutoConfig.from_pretrained(args.model_path) + + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + if args.causal: + class_name = f"{unreleased_model_name}ForCausalLM" + else: + class_name = f"{unreleased_model_name}Model" + print(f"Model class: {class_name}") + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(args.model_path) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) + else: + if args.causal: + model = AutoModelForCausalLM.from_pretrained(args.model_path) + else: + model = AutoModel.from_pretrained(args.model_path) + + encoded = tokenizer(args.prompt, return_tensors="pt") + tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) + n_tokens = len(tokens) + print(f"n_tokens: {n_tokens}"); + print(f"hidden_size: {model.config.hidden_size}") + + # Load binary embeddings from data directory. 
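+    # Both .bin files are flat float32 dumps with n_tokens * hidden_size values:
+    # llama-logits -embd-mode produces the llama.cpp file, and the
+    # run-original-model.py / run-casual-gen-embeddings-org.sh scripts produce the PyTorch file.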
+ llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size) + python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size) + + # Run comparison + results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt) + + # Summary + print(f"\n=== SUMMARY ===") + avg_cross_sim = np.mean(results['cross_model_similarities']) + print(f"Average cross-model similarity: {avg_cross_sim:.4f}") + print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}") + + # Quality assessment + if avg_cross_sim > 0.95: + print("✅ EXCELLENT: Models are highly similar") + elif avg_cross_sim > 0.90: + print("✅ VERY GOOD: Models are very similar") + elif avg_cross_sim > 0.80: + print("⚠️ GOOD: Models are reasonably similar") + elif avg_cross_sim > 0.70: + print("⚠️ FAIR: Models have some differences") + else: + print("❌ POOR: Models are significantly different") + +if __name__ == "__main__": + main() diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 2f19597c48..cbaf28fd82 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -11,5 +11,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 +llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md index 6938a1e96e..51038cc36b 100644 --- a/examples/retrieval/README.md +++ b/examples/retrieval/README.md @@ -15,7 +15,7 @@ https://github.com/ggml-org/llama.cpp/pull/6193 `retrieval` example can be tested as follows: ```bash -make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . +llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . ``` This chunks and embeds all given files and starts a loop requesting query inputs: diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index 6fc897b148..862998e737 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -18,8 +18,6 @@ if %errorlevel% neq 0 goto ERROR :: for FP32 cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR -:: build example/main only -:: make main :: build all binary cmake --build . -j diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 90e274ccdb..2ead001e2c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA" option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) -option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "ggml: max. 
batch size for using peer access") option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index da8813fd27..b8b82e11c8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -244,6 +244,13 @@ #define GGML_MROPE_SECTIONS 4 #define GGML_UNUSED(x) (void)(x) +#ifdef __CUDACC__ +template +__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {} +#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__) +#else +#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0) +#endif // __CUDACC__ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 1b9d29e911..c1e58fbb64 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -19,9 +19,8 @@ #include #include #include -#include -#include #include +#include #ifdef __APPLE__ #include @@ -1352,6 +1351,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_backend_sched_split * splits = sched->splits; + ggml_tensor * prev_ids_tensor = nullptr; + std::vector ids; + std::vector used_ids; + for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &splits[i]; int split_backend_id = split->backend_id; @@ -1378,16 +1381,91 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events - // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface - if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + + // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used + ggml_tensor * node = split->graph.nodes[0]; + if (split->graph.n_nodes > 0 && + ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && + ggml_backend_buffer_is_host(input->buffer) && ( + (node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID) + //|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */ + )) { + + const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1]; + const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? 
input->nb[2] : input->nb[1]; + ggml_backend_synchronize(input_backend); - if (sched->events[split_backend_id][sched->cur_copy] != NULL) { - ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); - } else { + + // get the ids + ggml_tensor * ids_tensor = node->src[2]; + if (ids_tensor != prev_ids_tensor) { + ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t)); + ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor)); ggml_backend_synchronize(split_backend); + + // find the used experts + used_ids.clear(); + used_ids.resize(ggml_bitset_size(n_expert)); + for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) { + int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)]; + ggml_bitset_set(used_ids.data(), id); + } + } + + prev_ids_tensor = ids_tensor; + } + + // group consecutive experts and copy them together + auto copy_experts = [&](int32_t first_id, int32_t last_id) { + const size_t expert_offset = first_id * expert_size; + const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; + const size_t padding = std::min(expert_size, 512); + const size_t padding_end = last_id < n_expert - 1 ? padding : 0; + + ggml_backend_tensor_set_async(split_backend, + input_cpy, + (const uint8_t *)input->data + expert_offset, expert_offset, + // copy a bit extra at the to ensure there are no NaNs in the padding of the last expert + // this is necessary for MMQ in the CUDA backend + expert_size_copy + padding_end); + }; + + int id = 0; + while (!ggml_bitset_get(used_ids.data(), id)) { + id++; + } + int32_t first_id = id; + int32_t last_id = first_id; + + for (++id; id < n_expert; ++id) { + if (!ggml_bitset_get(used_ids.data(), id)) { + continue; + } + + if (id == last_id + 1) { + last_id = id; + continue; + } + + copy_experts(first_id, last_id); + + first_id = id; + last_id = id; + } + copy_experts(first_id, last_id); + } else { + // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events + // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + ggml_backend_synchronize(input_backend); + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + } + ggml_backend_tensor_copy(input, input_cpy); } - ggml_backend_tensor_copy(input, input_cpy); } } } diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 259a2928b1..2a5cb8abfa 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2154,86 +2154,129 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, GGML_TENSOR_BINARY_OP_LOCALS - // theta_scale arange, [0,1,...,ne00/2 - 1] int64_t theta_scale_length = ne00 / 2; - ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), - theta_scale_length * sizeof(float_t)); - void* theta_scale_buffer = theta_scale_allocator.get(); int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), theta_scale_length * sizeof(float_t)}; 
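The scheduler change above copies only the experts that a `GGML_OP_MUL_MAT_ID` split actually uses: it reads the ids tensor, marks each referenced expert in a bitset, and then issues one copy per run of consecutive used experts. A standalone sketch of that grouping logic, with `std::vector<bool>` standing in for ggml's bitset and a plain callback standing in for `ggml_backend_tensor_set_async` (the extra bytes copied after the last expert of a run, which the real code needs to keep MMQ padding NaN-free, are omitted):

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Copy only the expert slices that appear in `ids`, merging consecutive experts
// into a single range so that fewer, larger copies are issued.
void copy_used_experts(const std::vector<int32_t> & ids, int64_t n_expert, size_t expert_size,
                       const std::function<void(size_t offset, size_t size)> & copy_range) {
    std::vector<bool> used(n_expert, false);   // stand-in for ggml_bitset_set/get
    for (int32_t id : ids) {
        used[id] = true;
    }

    int64_t first = -1;                        // start of the current run, -1 = no open run
    for (int64_t id = 0; id <= n_expert; ++id) {
        const bool is_used = id < n_expert && used[id];
        if (is_used && first < 0) {
            first = id;                        // open a new run
        } else if (!is_used && first >= 0) {
            copy_range(first * expert_size, (id - first) * expert_size);
            first = -1;                        // run ended at id - 1
        }
    }
}
```

Grouping runs keeps the number of host-to-device transfers small while still skipping every expert that no token routed to.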
- aclTensor* acl_theta_scale_tensor = - ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - float start = 0; - float step = 1; - float stop = ne00 / 2; - float n_elements = ne00 / 2; - aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); - - // power - aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, - acl_theta_scale_tensor); - - // freq_scale - if (freq_scale != 1) { - aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); - } - - // freq_factors - if (src2) { - aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( - src2->data, ggml_cann_type_mapping(src2->type), - ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); - ggml_cann_release_resources(ctx, acl_freq_factors_tensor); - } - - // position GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; int64_t position_ne[] = {1, 1, position_length, 1}; size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length}; - aclTensor* acl_position_tensor = ggml_cann_create_tensor( - src1->data, ggml_cann_type_mapping(src1->type), - ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); - // power * position - int64_t theta_length = theta_scale_length * position_length; - ggml_cann_pool_alloc theta_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* theta_buffer = theta_allocator.get(); int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1}; size_t theta_nb[GGML_MAX_DIMS]; theta_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; } - aclTensor* acl_theta_tensor = - ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), - theta_ne, theta_nb, GGML_MAX_DIMS); - aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, - acl_theta_tensor); - // sin/cos - ggml_cann_pool_alloc sin_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* sin_buffer = sin_allocator.get(); + bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0); + bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0); + + // used for accuracy testing + bool is_attention = is_q || is_k; + + if(ctx.init_ptr == nullptr || !is_attention) { + // theta_scale arange, [0,1,...,ne00/2 - 1] + if(ctx.init_ptr != nullptr){ + ACL_CHECK(aclrtFree(ctx.init_ptr)); + } + ACL_CHECK(aclrtMalloc(&ctx.init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + + aclTensor* acl_theta_scale_tensor = + ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + float start = 0; + float step = 1; + float stop = ne00 / 2; + float n_elements = ne00 / 2; + aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); + + // power + aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, + acl_theta_scale_tensor); + + // freq_scale + if (freq_scale != 1) { + aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); + } + + // freq_factors + if (src2) { + aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( + src2->data, ggml_cann_type_mapping(src2->type), + 
ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); + ggml_cann_release_resources(ctx, acl_freq_factors_tensor); + } + // release + ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale); + } + + if(ctx.sin_ptr == nullptr) { + int64_t theta_length = theta_scale_length * ctx.max_prompt_length; + ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + } + if(position_length > ctx.max_prompt_length) { + ctx.max_prompt_length = position_length; + int64_t theta_length = theta_scale_length * ctx.max_prompt_length; + ACL_CHECK(aclrtFree(ctx.sin_ptr)); + ACL_CHECK(aclrtFree(ctx.cos_ptr)); + ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + } + + bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0); + + if(is_fisrt_layer || !is_attention) { + + aclTensor* acl_theta_scale_tensor = + ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + + // position + aclTensor* acl_position_tensor = ggml_cann_create_tensor( + src1->data, ggml_cann_type_mapping(src1->type), + ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); + + // power * position + int64_t theta_length = theta_scale_length * position_length; + ggml_cann_pool_alloc theta_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* theta_buffer = theta_allocator.get(); + + aclTensor* acl_theta_tensor = + ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), + theta_ne, theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, + acl_theta_tensor); + + // sin/cos + aclTensor* acl_sin_tensor = ggml_cann_create_tensor( + ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); + + aclTensor* acl_cos_tensor = ggml_cann_create_tensor( + ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); + + // release + ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, + acl_theta_tensor, acl_sin_tensor, acl_cos_tensor); + } + aclTensor* acl_sin_tensor = ggml_cann_create_tensor( - sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); - - ggml_cann_pool_alloc cos_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* cos_buffer = cos_allocator.get(); + ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); aclTensor* acl_cos_tensor = ggml_cann_create_tensor( - cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); + ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); // attn_factor if (attn_factor != 1) { @@ -2257,8 +2300,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, } // release - ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, - acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, 
acl_theta_scale); + ggml_cann_release_resources(ctx, acl_sin_tensor, acl_cos_tensor); } #ifdef __cplusplus diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 9d294f72b6..2c2033bfba 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -368,6 +368,10 @@ struct ggml_backend_cann_context { std::string name; /**< Name of the device. */ std::string description; /**< Description of the device. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ + void* init_ptr = nullptr; + void* sin_ptr = nullptr; + void* cos_ptr = nullptr; + int64_t max_prompt_length = 65536; #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. std::unique_ptr cann_graph; @@ -414,6 +418,15 @@ struct ggml_backend_cann_context { ACL_CHECK(aclrtDestroyStream(streams[i])); } } + if(init_ptr != nullptr) { + ACL_CHECK(aclrtFree(init_ptr)); + } + if(sin_ptr != nullptr) { + ACL_CHECK(aclrtFree(sin_ptr)); + } + if(cos_ptr != nullptr) { + ACL_CHECK(aclrtFree(cos_ptr)); + } } /** diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index f476127995..0bfb92df17 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -73,7 +73,6 @@ #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K -#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index 49aae7a23b..d3dfd049ea 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_MXFP4 == 0); + static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same"); + + const block_mxfp4 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK_MXFP4; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char vshift4 = vec_splats((unsigned char)4); + vector float vsumf0 = vec_splats(0.0f); + + vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) * + GGML_E8M0_TO_FP32_HALF(x[ib].e)); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs); + + vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask); + vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4); + + vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles); + vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles); + + vector signed short qv0 = 
vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi0 = vec_sum4s(qv1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + sumf = vec_extract(vsumf0, 0); + *s = sumf; +#else + UNUSED(x); + UNUSED(y); + UNUSED(ib); + UNUSED(sumf); + ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index bce07ac362..ea824965aa 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -24,12 +24,6 @@ if (CUDAToolkit_FOUND) # for best performance and to also build real architectures for the most commonly used GPUs. if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") set(CMAKE_CUDA_ARCHITECTURES "native") - elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") - else() - set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") - endif() else() if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") @@ -91,10 +85,6 @@ if (CUDAToolkit_FOUND) add_compile_definitions(GGML_CUDA_NO_FA) endif() - if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_F16) - endif() - if (GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() diff --git a/ggml/src/ggml-cuda/add-id.cu b/ggml/src/ggml-cuda/add-id.cu index 8bed62ac9d..8d9cf692b4 100644 --- a/ggml/src/ggml-cuda/add-id.cu +++ b/ggml/src/ggml-cuda/add-id.cu @@ -11,14 +11,14 @@ static __global__ void add_id_kernel( const int64_t i1 = blockIdx.x; const int64_t i2 = blockIdx.y; - const int i11 = *(int32_t *) ((char *) src2 + i1*sizeof(int32_t) + i2*nb21); + const int i11 = *(const int32_t *) ((const char *) src2 + i1*sizeof(int32_t) + i2*nb21); const size_t nb1 = ne0 * sizeof(float); const size_t nb2 = ne1 * nb1; float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2); - const float * src0_row = (const float *)((char *)src0 + i1*nb01 + i2*nb02); - const float * src1_row = (const float *)((char *)src1 + i11*nb11); + const float * src0_row = (const float *)((const char *)src0 + i1*nb01 + i2*nb02); + const float * src1_row = (const float *)((const char *)src1 + i11*nb11); for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) { dst_row[i0] = src0_row[i0] + src1_row[i0]; diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 2b14b30ac9..767ad83f60 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -78,6 +78,8 @@ #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads +#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. 
for half2 -> uint mask comparisons + #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD @@ -204,14 +206,6 @@ static const char * cu_get_error_str(CUresult err) { #define GGML_CUDA_ASSUME(x) #endif // CUDART_VERSION >= 11010 -#ifdef GGML_CUDA_F16 -typedef half dfloat; // dequantize float -typedef half2 dfloat2; -#else -typedef float dfloat; // dequantize float -typedef float2 dfloat2; -#endif // GGML_CUDA_F16 - #if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM)) #define GGML_USE_VMM #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM)) @@ -490,13 +484,14 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP) } -#if CUDART_VERSION < CUDART_HMASK +#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \ + (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) { const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b))); const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); return mask_low | mask_high; } -#endif // CUDART_VERSION < CUDART_HMASK +#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIP) @@ -556,7 +551,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { #endif // CUDART_VERSION >= 12050 } -typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); +typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v); static __device__ __forceinline__ float get_alibi_slope( const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1 diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cu b/ggml/src/ggml-cuda/conv-transpose-1d.cu index fe4caf674d..8418ba6673 100644 --- a/ggml/src/ggml-cuda/conv-transpose-1d.cu +++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu @@ -34,10 +34,7 @@ static __global__ void conv_transpose_1d_kernel( } } dst[global_index] = accumulator; - GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3); - GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3); - GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1); - GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2); + GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2); } static void conv_transpose_1d_f32_f32_cuda( diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 8f0efdcc12..ba3d4eeb88 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -27,7 +27,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ const int64_t y_offset = qr == 1 ? 
1 : qk/2; // dequantize - dfloat2 v; + float2 v; dequantize_kernel(vx, ib, iqs, v); const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs; @@ -71,9 +71,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d)); } #else - GGML_UNUSED(vx); - GGML_UNUSED(y); - GGML_UNUSED(k); + GGML_UNUSED_VARS(vx, y, k); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL } diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index f9bb025643..c40db08ced 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -42,7 +42,7 @@ static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { #pragma unroll for (int j = 0; j < QK8_0; j += 2) { - dfloat2 dq; + float2 dq; dequantize_q8_0(cxi, 0, j, dq); *(cdstf + j) = dq.x; *(cdstf + j + 1) = dq.y; @@ -55,7 +55,7 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) { #pragma unroll for (int j = 0; j < qk/2; j++) { - dfloat2 dq; + float2 dq; dequant(cxi, 0, j, dq); *(cdstf + j) = dq.x; *(cdstf + j + qk/2) = dq.y; @@ -134,8 +134,7 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream)); cuda_graph->graph_cpynode_index = 0; // reset index #else - GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs); - GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream); + GGML_UNUSED_VARS(cuda_graph, host_dest_ptrs, host_dest_ptrs_size, stream); #endif } diff --git a/ggml/src/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh index bd3c2d9db9..e060fb29fd 100644 --- a/ggml/src/ggml-cuda/dequantize.cuh +++ b/ggml/src/ggml-cuda/dequantize.cuh @@ -1,48 +1,37 @@ #include "common.cuh" -static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q4_0 * x = (const block_q4_0 *) vx; - const dfloat d = x[ib].d; + const float d = x[ib].d; const int vui = x[ib].qs[iqs]; v.x = vui & 0xF; v.y = vui >> 4; -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {8.0f, 8.0f}); - v = __hmul2(v, {d, d}); -#else v.x = (v.x - 8.0f) * d; v.y = (v.y - 8.0f) * d; -#endif // GGML_CUDA_F16 } -static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); + const float2 dm = __half22float2(x[ib].dm); const int vui = x[ib].qs[iqs]; v.x = vui & 0xF; v.y = vui >> 4; -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 + v.x = (v.x * dm.x) + dm.y; + v.y = (v.y * dm.x) + dm.y; } -static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q5_0 * x = (const block_q5_0 *) vx; - const dfloat d = x[ib].d; + const float d = x[ib].d; uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -53,20 +42,14 @@ static 
__device__ __forceinline__ void dequantize_q5_0(const void * vx, const in v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {16.0f, 16.0f}); - v = __hmul2(v, {d, d}); -#else v.x = (v.x - 16.0f) * d; v.y = (v.y - 16.0f) * d; -#endif // GGML_CUDA_F16 } -static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); + const float2 dm = __half22float2(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -77,27 +60,18 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); v.y = ((x[ib].qs[iqs] >> 4) | xh_1); -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 + v.x = (v.x * dm.x) + dm.y; + v.y = (v.y * dm.x) + dm.y; } -static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, float2 & v){ const block_q8_0 * x = (const block_q8_0 *) vx; - const dfloat d = x[ib].d; + const float d = x[ib].d; v.x = x[ib].qs[iqs + 0]; v.y = x[ib].qs[iqs + 1]; -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); -#else v.x *= d; v.y *= d; -#endif // GGML_CUDA_F16 } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index d4ed938391..b69f57d659 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -704,28 +704,6 @@ static __global__ void flash_attn_combine_results( dst[tid] = VKQ_numerator / VKQ_denominator; } -[[noreturn]] -static void on_no_fattn_vec_case(const int D) { - if (D == 64) { - fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); - fprintf(stderr, "By default only f16 KV cache is supported.\n"); - fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); - GGML_ABORT("fatal error"); - } else if (D == 128) { - fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); - fprintf(stderr, "Supported combinations:\n"); - fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n"); - fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); - fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); - fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); - GGML_ABORT("fatal error"); - } else { - fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D); - fprintf(stderr, "Only f16 is supported.\n"); - GGML_ABORT("fatal error"); - } -} - template void launch_fattn( ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared, diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 39731baaeb..57defb0c62 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -767,14 +767,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } #else - GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); - GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); 
GGML_UNUSED(dstk_fixup); - GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); - GGML_UNUSED(stride_mask); GGML_UNUSED(tile_K); - GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B); - GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum); - GGML_UNUSED(kb0); GGML_UNUSED(tile_Q); + GGML_UNUSED_VARS(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, + scale, slope, logit_softcap, ne01, ne02, + stride_K, stride_V, stride_mask, + tile_Q, tile_K, tile_V, tile_mask, + Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -1236,12 +1233,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } #else - GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); - GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); - GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1); - GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask); - GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop); + GGML_UNUSED_VARS(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dstk_fixup, + scale, slope, logit_softcap, ne01, ne02, + stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, + jt, kb0_start, kb0_stop); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -1395,17 +1390,15 @@ static __global__ void flash_attn_ext_f16( (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); - GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(TURING_MMA_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 1e23f8f79c..6239d184d0 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -299,17 +299,15 @@ static __global__ void flash_attn_tile_ext_f16( } } #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); - GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); 
GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); - GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index c58194937d..b96a9ef971 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -38,17 +38,15 @@ static __global__ void flash_attn_tile_ext_f32( return; #endif // FP16_MMA_AVAILABLE if (use_logit_softcap && !(D == 128 || D == 256)) { - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); - GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; return; } @@ -312,17 +310,15 @@ static __global__ void flash_attn_tile_ext_f32( } } #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); - GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index b05f682cd3..27a2dd6ae4 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -349,17 +349,15 @@ static __global__ void flash_attn_vec_ext_f16( dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); } #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); 
GGML_UNUSED(mask); GGML_UNUSED(sinks); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); - GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) } diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index d6d0bfb744..da195d0334 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -37,17 +37,15 @@ static __global__ void flash_attn_vec_ext_f32( // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); - GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11); - GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); - GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22); - GGML_UNUSED(nb23); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; return; } @@ -345,17 +343,15 @@ static __global__ void flash_attn_vec_ext_f32( dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); } #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); - GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); - GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03); - GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, 
nb32, nb33); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE } diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 6bc7943ccd..2219191fd9 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -471,16 +471,15 @@ static __global__ void flash_attn_ext_f16( dst_meta[j_dst_unrolled] = dst_meta_val; } #else - GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks); - GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); - GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1); - GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); - GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); - GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33); GGML_UNUSED(nb31); - GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02); - GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); - GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); + GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, + max_bias, m0, m1, n_head_log2, logit_softcap, + ne00, ne01, ne02, ne03, + nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb11, nb12, nb13, + nb21, nb22, nb23, + ne31, ne32, ne33, + nb31, nb32, nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE))) } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 22e90d0e7b..4883427266 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -190,7 +190,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS - on_no_fattn_vec_case(Q->ne[0]); + GGML_ABORT("fatal error"); } #define FATTN_VEC_F32_CASE(D, type_K, type_V) \ @@ -265,74 +265,184 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS - on_no_fattn_vec_case(Q->ne[0]); + GGML_ABORT("fatal error"); } -void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +// Best FlashAttention kernel for a specific GPU: +enum best_fattn_kernel { + BEST_FATTN_KERNEL_NONE = 0, + BEST_FATTN_KERNEL_TILE_F32 = 200, + BEST_FATTN_KERNEL_TILE_F16 = 210, + BEST_FATTN_KERNEL_VEC_F32 = 100, + BEST_FATTN_KERNEL_VEC_F16 = 110, + BEST_FATTN_KERNEL_WMMA_F16 = 300, + BEST_FATTN_KERNEL_MMA_F16 = 400, +}; + +static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const ggml_tensor * dst) { +#ifndef FLASH_ATTN_AVAILABLE + GGML_UNUSED(device); GGML_UNUSED(dst); + return BEST_FATTN_KERNEL_NONE; +#endif// FLASH_ATTN_AVAILABLE + const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; const ggml_tensor * mask = dst->src[3]; - ggml_cuda_set_device(ctx.device); - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size; + const int gqa_ratio = Q->ne[2] / K->ne[2]; + GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); + + const int cc = ggml_cuda_info().devices[device].cc; + const int warp_size = ggml_cuda_info().devices[device].warp_size; const enum ggml_prec prec = 
ggml_flash_attn_ext_get_prec(KQV); -#if defined(GGML_HIP_ROCWMMA_FATTN) - if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) { - ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); - return; - } -#endif // defined(GGML_HIP_ROCWMMA_FATTN) - - if (!fast_fp16_available(cc)) { - if (Q->ne[1] <= 8 || Q->ne[0] == 256) { - ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); - } else { - ggml_cuda_flash_attn_ext_tile_f32(ctx, dst); - } - return; - } - - if (!fp16_mma_available(cc)) { - if (prec == GGML_PREC_DEFAULT) { - if (Q->ne[1] <= 8 || Q->ne[0] == 256) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - } else { - ggml_cuda_flash_attn_ext_tile_f16(ctx, dst); + switch (K->ne[0]) { + case 64: + case 128: + case 256: + if (V->ne[0] != K->ne[0]) { + return BEST_FATTN_KERNEL_NONE; } - } else { - if (Q->ne[1] <= 8 || Q->ne[0] == 256) { - ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); - } else { - ggml_cuda_flash_attn_ext_tile_f32(ctx, dst); + break; + case 80: + case 96: + case 112: + if (V->ne[0] != K->ne[0]) { + return BEST_FATTN_KERNEL_NONE; } - } - return; + if (!fp16_mma_available(cc) && !turing_mma_available(cc)) { + return BEST_FATTN_KERNEL_NONE; + } + break; + case 576: + if (V->ne[0] != 512) { + return BEST_FATTN_KERNEL_NONE; + } + if (!turing_mma_available(cc) || gqa_ratio % 16 != 0) { + return BEST_FATTN_KERNEL_NONE; + } + break; + default: + return BEST_FATTN_KERNEL_NONE; + } + +#ifndef GGML_CUDA_FA_ALL_QUANTS + if (K->type != V->type) { + return BEST_FATTN_KERNEL_NONE; + } +#endif // GGML_CUDA_FA_ALL_QUANTS + + switch (K->type) { + case GGML_TYPE_F16: + break; + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: +#ifndef GGML_CUDA_FA_ALL_QUANTS + return BEST_FATTN_KERNEL_NONE; +#endif // GGML_CUDA_FA_ALL_QUANTS + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: +#ifdef GGML_CUDA_FA_ALL_QUANTS + if (K->ne[0] != 128 && K->ne[0] != 64) { + return BEST_FATTN_KERNEL_NONE; + } +#else + if (K->ne[0] != 128) { + return BEST_FATTN_KERNEL_NONE; + } +#endif // GGML_CUDA_FA_ALL_QUANTS + break; + default: + return BEST_FATTN_KERNEL_NONE; + } + + switch (V->type) { + case GGML_TYPE_F16: + break; + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + if (K->ne[0] != 128) { + return BEST_FATTN_KERNEL_NONE; + } + break; + default: + return BEST_FATTN_KERNEL_NONE; + } + + if (mask && mask->ne[2] != 1) { + return BEST_FATTN_KERNEL_NONE; } - const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations - const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16; - const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192); - const bool mma_faster_for_bs1 = turing_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion && - (cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000); const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0; - if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) { - if (prec == GGML_PREC_DEFAULT) { - ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); - } else { - ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); + + // If Turing tensor cores available, use them except for some cases with batch size 1: + if (turing_mma_available(cc)) { + const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations + const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16; + 
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192); + const bool mma_faster_for_bs1 = gqa_opt_applies && !mma_needs_data_conversion && + (cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000); + if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) { + if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { + return BEST_FATTN_KERNEL_VEC_F16; + } + return BEST_FATTN_KERNEL_VEC_F32; } - return; + return BEST_FATTN_KERNEL_MMA_F16; } - // The MMA implementation needs Turing or newer, use the old WMMA code for Volta: - if (fp16_mma_available(cc) && !turing_mma_available(cc)) { - ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); - return; + // Use kernels specializes for small batch sizes if possible: + if (Q->ne[1] <= 8 && can_use_vector_kernel) { + if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { + return BEST_FATTN_KERNEL_VEC_F16; + } + return BEST_FATTN_KERNEL_VEC_F32; } - ggml_cuda_flash_attn_ext_mma_f16(ctx, dst); + // For large batch sizes, use the WMMA kernel if possible: + if (fp16_mma_available(cc)) { + return BEST_FATTN_KERNEL_WMMA_F16; + } + + // If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes: + if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { + return BEST_FATTN_KERNEL_TILE_F16; + } + return BEST_FATTN_KERNEL_TILE_F32; +} + +void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_set_device(ctx.device); + switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) { + case BEST_FATTN_KERNEL_NONE: + GGML_ABORT("fatal error"); + case BEST_FATTN_KERNEL_TILE_F32: + ggml_cuda_flash_attn_ext_tile_f32(ctx, dst); + break; + case BEST_FATTN_KERNEL_TILE_F16: + ggml_cuda_flash_attn_ext_tile_f16(ctx, dst); + break; + case BEST_FATTN_KERNEL_VEC_F32: + ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); + break; + case BEST_FATTN_KERNEL_VEC_F16: + ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); + break; + case BEST_FATTN_KERNEL_WMMA_F16: + ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst); + break; + case BEST_FATTN_KERNEL_MMA_F16: + ggml_cuda_flash_attn_ext_mma_f16(ctx, dst); + break; + } +} + +bool ggml_cuda_flash_attn_ext_supported(int device, const ggml_tensor * dst) { + return ggml_cuda_get_best_fattn_kernel(device, dst) != BEST_FATTN_KERNEL_NONE; } diff --git a/ggml/src/ggml-cuda/fattn.cuh b/ggml/src/ggml-cuda/fattn.cuh index ad3ca7a8d8..78705d5995 100644 --- a/ggml/src/ggml-cuda/fattn.cuh +++ b/ggml/src/ggml-cuda/fattn.cuh @@ -1,3 +1,5 @@ #include "common.cuh" void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +bool ggml_cuda_flash_attn_ext_supported(int device, const ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 68d3254fbe..3ec0e957ab 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -32,7 +32,7 @@ static __global__ void k_get_rows( const int y_offset = qr == 1 ? 
1 : qk/2; // dequantize - dfloat2 v; + float2 v; dequantize_kernel(src0_row, ib, iqs, v); dst_row[iybs + iqs + 0] = ggml_cuda_cast(v.x); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d6402a8daa..4e17fd211e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1328,9 +1328,7 @@ static void ggml_cuda_op_mul_mat_cublas( &beta, dst_dd_i, ldc)); } - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddq_i); - GGML_UNUSED(src1_padded_row_size); + GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size); } static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { @@ -3499,44 +3497,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: return true; - case GGML_OP_FLASH_ATTN_EXT: { -#ifndef FLASH_ATTN_AVAILABLE - return false; -#endif // FLASH_ATTN_AVAILABLE - if (op->src[1]->ne[0] != op->src[2]->ne[0]) { - const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; - if (!turing_mma_available(cc)) { - return false; - } - const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2]; - return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0; - } - // TODO: more general-purpose attention sink support [TAG_ATTN_SINKS] - if (op->src[4] && !fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) - && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) { - return false; - } - if (op->src[0]->ne[0] == 192) { - return false; - } - if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) { - return false; - } - if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) { - return true; - } - if (op->src[0]->ne[0] == 128) { - return true; - } - if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) { - return true; - } - if (op->src[3] && op->src[3]->ne[2] != 1) { - return false; - } - return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) && - op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; - } + case GGML_OP_FLASH_ATTN_EXT: + return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op); case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_OPT_STEP_ADAMW: @@ -3672,10 +3634,6 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t features.push_back({ "NO_PEER_COPY", "1" }); #endif - #ifdef GGML_CUDA_F16 - features.push_back({ "F16", "1" }); - #endif - #ifdef GGML_CUDA_USE_GRAPHS features.push_back({ "USE_GRAPHS", "1" }); #endif diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 83ee16b27d..667deb9c65 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -291,9 +291,7 @@ namespace ggml_cuda_mma { : "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3]) : "l"(xs)); #else - GGML_UNUSED(t); - GGML_UNUSED(xs0); - GGML_UNUSED(stride); + GGML_UNUSED_VARS(t, xs0, stride); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -315,9 +313,7 @@ namespace ggml_cuda_mma { : "r"(A.x[1]), "r"(B.x[0])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -345,9 +341,7 @@ namespace ggml_cuda_mma { : "r"(A.x[3]), "r"(B.x[1])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); 
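The `GGML_UNUSED(...)` chains being collapsed in these kernels all go through the variadic `GGML_UNUSED_VARS` helper added to `ggml.h` earlier in this diff. A minimal self-contained sketch of the same pattern (identifiers renamed; not ggml's actual header):

```cpp
// Variadic "unused" helper: a constexpr no-op function when compiling CUDA sources,
// and a sizeof-over-a-comma-expression statement otherwise.
#ifdef __CUDACC__
template <typename... Args>
__host__ __device__ constexpr inline void unused_vars_impl(Args &&...) noexcept {}
#define UNUSED_VARS(...) unused_vars_impl(__VA_ARGS__)
#else
#define UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while (0)
#endif

static void stub_kernel_body(const float * x, float * dst, int ncols, int nrows) {
    // previously: GGML_UNUSED(x); GGML_UNUSED(dst); GGML_UNUSED(ncols); GGML_UNUSED(nrows);
    UNUSED_VARS(x, dst, ncols, nrows);  // one statement references every otherwise-unused parameter
}

int main() {
    stub_kernel_body(nullptr, nullptr, 0, 0);
    return 0;
}
```

Neither form does any work at runtime; it only gives the compiler a use of each name so unused-parameter warnings stay quiet in stubbed-out code paths.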
NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -372,9 +366,7 @@ namespace ggml_cuda_mma { : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -408,9 +400,7 @@ namespace ggml_cuda_mma { : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -425,9 +415,7 @@ namespace ggml_cuda_mma { : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]) : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1])); #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } @@ -452,9 +440,7 @@ namespace ggml_cuda_mma { : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -469,9 +455,7 @@ namespace ggml_cuda_mma { : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]) : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1])); #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } @@ -505,9 +489,7 @@ namespace ggml_cuda_mma { : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3])); #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } @@ -533,9 +515,7 @@ namespace ggml_cuda_mma { 0, 0, 0); #endif // defined(CDNA3) #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // AMD_MFMA_AVAILABLE } @@ -561,9 +541,7 @@ namespace ggml_cuda_mma { 0, 0, 0); #endif // defined(CDNA3) #else - GGML_UNUSED(D); - GGML_UNUSED(A); - GGML_UNUSED(B); + GGML_UNUSED_VARS(D, A, B); NO_DEVICE_CODE; #endif // AMD_MFMA_AVAILABLE } diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 1437367e87..cfa5c5cce2 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -132,11 +132,11 @@ static __global__ void mul_mat_f( dst[j*stride_col_dst + row0 + threadIdx.x] = sum; } #else + GGML_UNUSED_VARS(x, y, ids, dst, + ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); NO_DEVICE_CODE; - GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(ids); GGML_UNUSED(dst); - GGML_UNUSED(ncols); GGML_UNUSED(nchannels_y); GGML_UNUSED(stride_row); GGML_UNUSED(stride_col_y); GGML_UNUSED(stride_col_dst); - GGML_UNUSED(channel_ratio); GGML_UNUSED(stride_channel_x); GGML_UNUSED(stride_channel_y); GGML_UNUSED(stride_channel_dst); - GGML_UNUSED(sample_ratio); GGML_UNUSED(stride_sample_x); GGML_UNUSED(stride_sample_y); GGML_UNUSED(stride_sample_dst); #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) } @@ -151,7 +151,6 @@ static void mul_mat_f_cuda( cudaStream_t stream) { typedef tile<16, 8, T> tile_A; typedef tile< 8, 8, T> tile_B; - typedef tile<16, 8, float> tile_C; GGML_ASSERT(!ids && "mul_mat_id not implemented"); @@ -352,9 +351,6 @@ void 
ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); GGML_ASSERT( nb0 == ts_dst); - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; float * dst_d = (float *) dst->data; diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 384ee7615f..576032a0ce 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -266,10 +266,7 @@ void ggml_cuda_op_mul_mat_q( ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddf_i); - GGML_UNUSED(src1_padded_row_size); + GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size); } bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 96129bd831..650f708067 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -1255,7 +1255,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #else - GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); + GGML_UNUSED_VARS(x, y, sum, k00); NO_DEVICE_CODE; #endif // AMD_MFMA_AVAILABLE } @@ -1572,7 +1572,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } } #else - GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); + GGML_UNUSED_VARS(x, y, sum, k00); NO_DEVICE_CODE; #endif // AMD_MFMA_AVAILABLE } @@ -2301,7 +2301,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #else - GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00); + GGML_UNUSED_VARS(x, y, sum, k00); NO_DEVICE_CODE; #endif // AMD_MFMA_AVAILABLE } @@ -2855,12 +2855,14 @@ static __device__ __forceinline__ void mmq_write_back_mma( #else typedef tile<16, 8, int> tile_C; constexpr int rows_per_warp = 2 * granularity; -#endif +#endif // defined(AMD_MFMA_AVAILABLE) constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp. 
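To make the write-back tiling in mmq_write_back_mma concrete, here is the arithmetic with assumed numbers (tile_C::I == 16 matches the tile<16, 8, int> typedef in the branch shown above; the granularity value is purely illustrative):

```cpp
// Worked numbers only; `granularity` is a template parameter whose value is assumed here.
constexpr int tile_C_I      = 16;                       // tile<16, 8, int>::I in the non-MFMA branch above
constexpr int granularity   = 16;                       // assumed for illustration
constexpr int rows_per_warp = 2 * granularity;          // 32 rows written back per warp
constexpr int ntx           = rows_per_warp / tile_C_I; // 2 x-minitiles per warp

static_assert(ntx * tile_C_I == rows_per_warp, "minitiles exactly cover the warp's rows");

int main() { return ntx; }
```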
const int i0 = (threadIdx.y / ntx) * (ntx*tile_C::I); #if defined(TURING_MMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) static_assert(nwarps*tile_C::I == mmq_y, "nwarps*tile_C::I != mmq_y"); +#else + GGML_UNUSED(nwarps); #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) #pragma unroll diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 16100b6804..5b21ef05b3 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -433,12 +433,7 @@ void ggml_cuda_op_mul_mat_vec_f( GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); } - GGML_UNUSED(ctx); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddq_i); - GGML_UNUSED(src1_ncols); - GGML_UNUSED(src1_padded_row_size); + GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size); } bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) { diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 5c8e5c4a7e..b7c3079308 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -596,9 +596,5 @@ void ggml_cuda_op_mul_mat_vec_q( src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddf_i); - GGML_UNUSED(src1_ncols); - GGML_UNUSED(src1_padded_row_size); + GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size); } diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh index 6bee204136..6bcae9e52f 100644 --- a/ggml/src/ggml-cuda/reduce_rows.cuh +++ b/ggml/src/ggml-cuda/reduce_rows.cuh @@ -39,7 +39,7 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r } __syncthreads(); sum = 0.0f; - if (lane_id < (blockDim.x / WARP_SIZE)) { + if (lane_id < (static_cast(blockDim.x) / WARP_SIZE)) { sum = s_sum[lane_id]; } sum = warp_reduce_sum(sum); diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index d8f9aa5ba6..d60292b83b 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -87,7 +87,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); } -#ifdef GGML_CUDA_F16 +#ifdef FAST_FP16_AVAILABLE const float2 tmp = __half22float2(__hmul2(dm4, ds8)); const float d4d8 = tmp.x; const float m4s8 = tmp.y; @@ -96,7 +96,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp const float2 ds8f = __half22float2(ds8); const float d4d8 = dm4f.x * ds8f.x; const float m4s8 = dm4f.y * ds8f.y; -#endif // GGML_CUDA_F16 +#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); @@ -158,7 +158,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values } -#ifdef GGML_CUDA_F16 +#ifdef FAST_FP16_AVAILABLE const float2 tmp = __half22float2(__hmul2(dm5, ds8)); const float d5d8 = tmp.x; const float m5s8 = tmp.y; @@ -167,7 +167,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp const float2 ds8f = __half22float2(ds8); const float d5d8 = dm5f.x * ds8f.x; const float m5s8 = dm5f.y * ds8f.y; -#endif // GGML_CUDA_F16 +#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI5_1 / vdr to compensate 
for multiple threads adding it return sumi*d5d8 + m5s8 / (QI5_1 / vdr); @@ -201,7 +201,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp sumi = ggml_cuda_dp4a(v[i], u[i], sumi); } -#ifdef GGML_CUDA_F16 +#ifdef FAST_FP16_AVAILABLE const float2 tmp = __half22float2(__hmul2(dm8, ds8)); const float d8d8 = tmp.x; const float m8s8 = tmp.y; @@ -210,7 +210,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp const float2 ds8f = __half22float2(ds8); const float d8d8 = dm8f.x * ds8f.x; const float m8s8 = dm8f.y * ds8f.y; -#endif // GGML_CUDA_F16 +#endif // FAST_FP16_AVAILABLE // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index cb8eff4a77..7c70d352df 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1846,7 +1846,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_ROPE: return true; case GGML_OP_IM2COL: - return op->src[0]->type == GGML_TYPE_F16; + return op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); case GGML_OP_POOL_1D: return false; case GGML_OP_UPSCALE: @@ -4703,7 +4703,6 @@ static int ggml_metal_encode_node( { GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index 02904526ad..cdb3818c78 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -96,10 +96,6 @@ if (MUSAToolkit_FOUND) add_compile_definitions(GGML_CUDA_NO_FA) endif() - if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_F16) - endif() - if (GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 015821f34f..9a7ccbcff0 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -112,6 +112,9 @@ set(GGML_OPENCL_KERNELS mul_mat_f16_f32 conv2d conv2d_f16_f32 + flash_attn_f32_f16 + flash_attn_f16 + flash_attn_f32 ) foreach (K ${GGML_OPENCL_KERNELS}) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index b8cbf6cf0d..df27501361 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -332,6 +333,7 @@ struct ggml_backend_opencl_context { cl_int alignment; size_t max_alloc_size; + size_t max_workgroup_size; bool fp16_support; bool has_vector_subgroup_broadcast; bool disable_fusion; @@ -424,6 +426,14 @@ struct ggml_backend_opencl_context { cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8; cl_kernel kernel_soft_max, kernel_soft_max_4; cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16; + std::map, cl_kernel> kernels_flash_attn_f16; + std::map, cl_kernel> kernels_flash_attn_f16_q1; + std::map, cl_kernel> kernels_flash_attn_f32; + std::map, cl_kernel> kernels_flash_attn_f32_q1; + std::map, cl_kernel> kernels_flash_attn_f32_f16; + std::map, cl_kernel> kernels_flash_attn_f32_f16_q1; + std::map, int> kernels_flash_attn_bm; + std::map, int> kernels_flash_attn_bn; cl_kernel 
kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0; cl_kernel kernel_set_rows_f32, kernel_set_rows_f16; cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; @@ -1308,6 +1318,73 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // flash_attn + { + #ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_f16 { + #include "flash_attn_f16.cl.h" + }; + const std::string kernel_src_f32 { + #include "flash_attn_f32.cl.h" + }; + const std::string kernel_src_f32_f16 { + #include "flash_attn_f32_f16.cl.h" + }; + #else + const std::string kernel_src_f16 = read_file("flash_attn_f16.cl"); + const std::string kernel_src_f32 = read_file("flash_attn_f32.cl"); + const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl"); + #endif + + if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) { + const struct { int dk; int dv; int bm; int bn; } fa_dims[] = { + { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32}, + {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16}, + {192, 192, 16, 16}, {256, 256, 16, 16}, + }; + + for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) { + const int dk = fa_dims[i].dk; + const int dv = fa_dims[i].dv; + const int bm = fa_dims[i].bm; + const int bn = fa_dims[i].bn; + std::string OPTS = compile_opts + + " -D DK=" + std::to_string(dk) + + " -D DV=" + std::to_string(dv) + + " -D BLOCK_M=" + std::to_string(bm) + + " -D BLOCK_N=" + std::to_string(bn); + + cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS); + cl_kernel k_f16, k_f16_q1; + CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err)); + CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err)); + backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16; + backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1; + CL_CHECK(clReleaseProgram(prog_f16)); + + cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS); + cl_kernel k_f32, k_f32_q1; + CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err)); + CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err)); + backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32; + backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1; + CL_CHECK(clReleaseProgram(prog_f32)); + + cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS); + cl_kernel k_f32_f16, k_f32_f16_q1; + CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err)); + CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err)); + backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16; + backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1; + CL_CHECK(clReleaseProgram(prog_f32_f16)); + + backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm; + backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn; + } + GGML_LOG_CONT("."); + } + } + // argsort { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2142,6 +2219,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024); + 
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL); + GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size); + // Check SVM. cl_device_svm_capabilities svm_caps; CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0)); @@ -2457,7 +2537,8 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm } static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - GGML_UNUSED(dev); + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context; + ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; switch (op->op) { case GGML_OP_NONE: @@ -2632,10 +2713,58 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te } case GGML_OP_IM2COL: return true; - case GGML_OP_ARGSORT: - return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ARGSORT: { + cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32; + int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel); + + int cols = 1; + while (cols < op->ne[0]) { + cols *= 2; + } + + return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32; + } case GGML_OP_SUM_ROWS: return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]); + case GGML_OP_FLASH_ATTN_EXT: + { + if (op->src[4]) { + return false; + } + + const ggml_tensor * q = op->src[0]; + const ggml_tensor * k = op->src[1]; + const ggml_tensor * v = op->src[2]; + + const int dk = q->ne[0]; + const int dv = v->ne[0]; + + const struct { int dk; int dv; } supported_dims[] = { + { 64, 64}, { 80, 80}, { 96, 96}, + {112, 112}, {128, 128}, {192, 128}, + {192, 192}, {256, 256}, + }; + + bool dims_supported = false; + for (size_t i = 0; i < sizeof(supported_dims)/sizeof(supported_dims[0]); ++i) { + if (supported_dims[i].dk == dk && supported_dims[i].dv == dv) { + dims_supported = true; + break; + } + } + if (!dims_supported) { + return false; + } + + const bool is_f32_f32 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F32 && + v->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + const bool is_f16_f16 = q->type == GGML_TYPE_F16 && k->type == GGML_TYPE_F16 && + v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16; + const bool is_f32_f16 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F16 && + v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F32; + + return is_f32_f32 || is_f16_f16 || is_f32_f16; + } default: return false; } @@ -5451,6 +5580,133 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } +static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) { + const ggml_tensor * v = dst->src[2]; + const ggml_tensor * mask = dst->src[3]; + GGML_ASSERT(q->extra); + GGML_ASSERT(k->extra); + GGML_ASSERT(v->extra); + GGML_ASSERT(dst->extra); + if (mask) { + GGML_ASSERT(mask->extra); + } + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + const int n_q = q->ne[1]; + const int n_kv = k->ne[1]; + const int d_head_q = q->ne[0]; + const int d_head_v = v->ne[0]; + const int n_head = q->ne[2]; + const int n_head_kv = k->ne[2]; + const int n_batch = q->ne[3]; + + cl_kernel kernel = NULL; + + const bool is_f16 = q->type == GGML_TYPE_F16; + const bool is_mixed = q->type 
== GGML_TYPE_F32 && k->type == GGML_TYPE_F16; + const std::pair dk_dv = {d_head_q, d_head_v}; + + if (n_q == 1) { + if (is_mixed) { + kernel = backend_ctx->kernels_flash_attn_f32_f16_q1.at(dk_dv); + } else if (is_f16) { + kernel = backend_ctx->kernels_flash_attn_f16_q1.at(dk_dv); + } else { + kernel = backend_ctx->kernels_flash_attn_f32_q1.at(dk_dv); + } + } else { + if (is_mixed) { + kernel = backend_ctx->kernels_flash_attn_f32_f16.at(dk_dv); + } else if (is_f16) { + kernel = backend_ctx->kernels_flash_attn_f16.at(dk_dv); + } else { + kernel = backend_ctx->kernels_flash_attn_f32.at(dk_dv); + } + } + GGML_ASSERT(kernel != NULL); + + ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *)q->extra; + ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *)k->extra; + ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra; + ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra; + ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL; + + cl_ulong offset_q = extra_q->offset + q->view_offs; + cl_ulong offset_k = extra_k->offset + k->view_offs; + cl_ulong offset_v = extra_v->offset + v->view_offs; + cl_ulong offset_o = extra_o->offset + dst->view_offs; + cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL; + cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0; + + const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3]; + const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3]; + const cl_ulong v_nb1 = v->nb[1], v_nb2 = v->nb[2], v_nb3 = v->nb[3]; + const cl_ulong o_nb1 = dst->nb[1], o_nb2 = dst->nb[2], o_nb3 = dst->nb[3]; + const cl_ulong mask_nb1 = mask ? mask->nb[1] : 0; + const cl_ulong mask_nb2 = mask ? mask->nb[2] : 0; + const cl_ulong mask_nb3 = mask ? mask->nb[3] : 0; + const int mask_ne2 = mask ? mask->ne[2] : 0; + const int mask_ne3 = mask ? mask->ne[3] : 0; + + float scale, max_bias, logit_softcap; + const float * params = (const float *)dst->op_params; + scale = params[0]; + max_bias = params[1]; + logit_softcap = params[2]; + + const int is_causal = (mask == NULL && n_q > 1 && n_q == n_kv); + + const int n_head_log2_val = n_head > 0 ? 1u << (int)floorf(log2f((float)n_head)) : 0; + const float n_head_log2_f = n_head_log2_val > 0 ? 
(float)n_head_log2_val : 1.0f; + const float m0 = powf(2.0f, -(max_bias) / n_head_log2_f); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2_f); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_q->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset_q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_k->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset_k)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra_v->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset_v)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extra_o->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offset_o)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &n_q)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &n_kv)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &is_causal)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &n_head)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &q_nb1)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &q_nb2)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &q_nb3)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &k_nb1)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &k_nb2)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &k_nb3)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &v_nb1)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &v_nb2)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &v_nb3)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &o_nb1)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &o_nb2)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &o_nb3)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(float), &max_bias)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(float), &m0)); + CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &m1)); + CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &n_head_log2_val)); + CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &logit_softcap)); + CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &n_head_kv)); + CL_CHECK(clSetKernelArg(kernel, 31, sizeof(cl_mem), &mask_buffer)); + CL_CHECK(clSetKernelArg(kernel, 32, sizeof(cl_ulong), &offset_mask)); + CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_ulong), &mask_nb1)); + CL_CHECK(clSetKernelArg(kernel, 34, sizeof(cl_ulong), &mask_nb2)); + CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3)); + CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2)); + CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3)); + + if (n_q == 1) { + const size_t wg_size = 64; + size_t local_work_size[] = { wg_size, 1 }; + size_t global_work_size[] = { wg_size, (size_t)(n_head * n_batch) }; + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + } else { + const int block_m = backend_ctx->kernels_flash_attn_bm.at(dk_dv); + const size_t wg_size = block_m; + size_t local_work_size[] = { wg_size, 1 }; + size_t global_work_size[] = { (size_t)((n_q + block_m - 1) / block_m) * wg_size, (size_t)(n_head * n_batch) }; + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); + } +} + static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; 
@@ -7607,6 +7863,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_sum_rows; break; + case GGML_OP_FLASH_ATTN_EXT: + if (!any_on_device) { + return false; + } + ggml_cl_flash_attn(backend, tensor->src[0], tensor->src[1], tensor); + return true; default: return false; } diff --git a/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl b/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl new file mode 100644 index 0000000000..fea06867e1 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl @@ -0,0 +1,343 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define ACC_TYPE float +#define ACC_TYPE4 float4 +#define DATA_TYPE half +#define DATA_TYPE4 half4 +#define CONVERT_ACC4(x) convert_float4(x) +#define CONVERT_DATA4(x) convert_half4(x) + +#define DK_VEC (DK/4) +#define DV_VEC (DV/4) +#define WG_SIZE (BLOCK_M) +#define Q1_WG_SIZE 64 + +inline float get_alibi_slope( + const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1 +) { + if (max_bias <= 0.0f) { + return 1.0f; + } + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + return pow(base, exph); +} +__kernel void flash_attn_f16( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong v_nb2, const ulong v_nb3, + const ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int block_q_idx = get_group_id(0); + const int head_batch_idx = get_global_id(1); + + const int my_query_row = block_q_idx * BLOCK_M + tid; + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + if (my_query_row < n_q) { + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1; + const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_ACC4(q_ptr[i]); + } + } + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = (ACC_TYPE4)(0.0f); + } + ACC_TYPE m_i = -INFINITY; + ACC_TYPE l_i = 0.0f; + + float slope = 
get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + __local DATA_TYPE4 l_k[BLOCK_N][DK_VEC]; + __local DATA_TYPE4 l_v[BLOCK_N][DV_VEC]; + + for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) { + for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) { + const int row = i / DK_VEC; + const int col = i % DK_VEC; + const int k_row_idx = k_start + row; + if (k_row_idx < n_kv) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1; + l_k[row][col] = ((__global DATA_TYPE4*)(k_base + k_row_offset))[col]; + } + } + for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) { + const int row = i / DV_VEC; + const int col = i % DV_VEC; + const int v_row_idx = k_start + row; + if (v_row_idx < n_kv) { + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1; + l_v[row][col] = ((__global DATA_TYPE4*)(v_base + v_row_offset))[col]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (my_query_row >= n_q) { + continue; + } + + for (int j = 0; j < BLOCK_N; j += 2) { + const int k_row0 = k_start + j; + const int k_row1 = k_start + j + 1; + + ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f); + ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0); + dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1); + } + ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale; + ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale; + + if (is_causal) { + if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY; + if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY; + } + + if (k_row0 >= n_kv) score0 = -INFINITY; + if (k_row1 >= n_kv) score1 = -INFINITY; + + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1); + if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0]; + if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1]; + } + + if (logit_softcap > 0.0f) { + score0 = logit_softcap * tanh(score0 / logit_softcap); + score1 = logit_softcap * tanh(score1 / logit_softcap); + } + + const ACC_TYPE m_new = max(m_i, max(score0, score1)); + const ACC_TYPE p0 = exp(score0 - m_new); + const ACC_TYPE p1 = exp(score1 - m_new); + const ACC_TYPE scale_prev = exp(m_i - m_new); + + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]); + } + l_i = l_i * scale_prev + p0 + p1; + m_i = m_new; + } + } + + if (my_query_row < n_q) { + const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1; + global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset); + if (l_i > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_i; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv); + } + } else { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = (DATA_TYPE4)(0.0f); + } + } + } +} + +__kernel void flash_attn_f16_q1( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong 
v_nb2, const ulong v_nb3, + const ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int head_batch_idx = get_global_id(1); + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2; + const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_ACC4(q_ptr[i]); + } + + float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + ACC_TYPE m_i = -INFINITY; + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base); + score += slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + m_i = max(m_i, score); + } + + __local ACC_TYPE local_m[Q1_WG_SIZE]; + local_m[tid] = m_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]); + barrier(CLK_LOCAL_MEM_FENCE); + } + const ACC_TYPE m_final = local_m[0]; + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f); + ACC_TYPE l_i = 0.0f; + + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1; + const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset); + const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base); + score += 
slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + const ACC_TYPE p = exp(score - m_final); + l_i += p; + #pragma unroll + for (int i = 0; i < DV_VEC; i++) { + o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]); + } + } + + __local ACC_TYPE local_l[Q1_WG_SIZE]; + __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE]; + local_l[tid] = l_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_l[tid] += local_l[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1; + global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset); + const ACC_TYPE l_final = local_l[0]; + + if (l_final > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_final; + for (int i = 0; i < DV_VEC; i++) { + local_o_comp[tid] = o_acc[i]; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_o_comp[tid] += local_o_comp[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + o_row[i] = CONVERT_DATA4(local_o_comp[0] * l_inv); + } + } + } else if (tid == 0) { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f); + } +} diff --git a/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl b/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl new file mode 100644 index 0000000000..2d657327d6 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl @@ -0,0 +1,343 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define ACC_TYPE float +#define ACC_TYPE4 float4 +#define DATA_TYPE float +#define DATA_TYPE4 float4 +#define CONVERT_ACC4(x) (x) +#define CONVERT_DATA4(x) (x) + +#define DK_VEC (DK/4) +#define DV_VEC (DV/4) +#define WG_SIZE (BLOCK_M) +#define Q1_WG_SIZE 64 + +inline float get_alibi_slope( + const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1 +) { + if (max_bias <= 0.0f) { + return 1.0f; + } + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? 
h + 1 : 2*(h - n_head_log2) + 1; + + return pow(base, exph); +} +__kernel void flash_attn_f32( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong v_nb2, const ulong v_nb3, + const ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int block_q_idx = get_group_id(0); + const int head_batch_idx = get_global_id(1); + + const int my_query_row = block_q_idx * BLOCK_M + tid; + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + if (my_query_row < n_q) { + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1; + const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_ACC4(q_ptr[i]); + } + } + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = (ACC_TYPE4)(0.0f); + } + ACC_TYPE m_i = -INFINITY; + ACC_TYPE l_i = 0.0f; + + float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + __local DATA_TYPE4 l_k[BLOCK_N][DK_VEC]; + __local DATA_TYPE4 l_v[BLOCK_N][DV_VEC]; + + for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) { + for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) { + const int row = i / DK_VEC; + const int col = i % DK_VEC; + const int k_row_idx = k_start + row; + if (k_row_idx < n_kv) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1; + l_k[row][col] = ((__global DATA_TYPE4*)(k_base + k_row_offset))[col]; + } + } + for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) { + const int row = i / DV_VEC; + const int col = i % DV_VEC; + const int v_row_idx = k_start + row; + if (v_row_idx < n_kv) { + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1; + l_v[row][col] = ((__global DATA_TYPE4*)(v_base + v_row_offset))[col]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (my_query_row >= n_q) { + continue; + } + + for (int j = 0; j < BLOCK_N; j += 2) { + const int k_row0 = k_start + j; + const int k_row1 = k_start + j + 1; + + ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f); + ACC_TYPE4 dot_acc1 = 
(ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc0 = mad(q_priv[k], CONVERT_ACC4(l_k[j][k]), dot_acc0); + dot_acc1 = mad(q_priv[k], CONVERT_ACC4(l_k[j+1][k]), dot_acc1); + } + ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale; + ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale; + + if (is_causal) { + if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY; + if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY; + } + + if (k_row0 >= n_kv) score0 = -INFINITY; + if (k_row1 >= n_kv) score1 = -INFINITY; + + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base + my_query_row * mask_nb1); + if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0]; + if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1]; + } + + if (logit_softcap > 0.0f) { + score0 = logit_softcap * tanh(score0 / logit_softcap); + score1 = logit_softcap * tanh(score1 / logit_softcap); + } + + const ACC_TYPE m_new = max(m_i, max(score0, score1)); + const ACC_TYPE p0 = exp(score0 - m_new); + const ACC_TYPE p1 = exp(score1 - m_new); + const ACC_TYPE scale_prev = exp(m_i - m_new); + + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_ACC4(l_v[j][i]) + p1 * CONVERT_ACC4(l_v[j+1][i]); + } + l_i = l_i * scale_prev + p0 + p1; + m_i = m_new; + } + } + + if (my_query_row < n_q) { + const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1; + global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset); + if (l_i > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_i; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = CONVERT_DATA4(o_acc[i] * l_inv); + } + } else { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = (DATA_TYPE4)(0.0f); + } + } + } +} + +__kernel void flash_attn_f32_q1( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong v_nb2, const ulong v_nb3, + const ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int head_batch_idx = get_global_id(1); + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + 
mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2; + const global DATA_TYPE4* q_ptr = (const global DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_ACC4(q_ptr[i]); + } + + float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + ACC_TYPE m_i = -INFINITY; + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base); + score += slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + m_i = max(m_i, score); + } + + __local ACC_TYPE local_m[Q1_WG_SIZE]; + local_m[tid] = m_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]); + barrier(CLK_LOCAL_MEM_FENCE); + } + const ACC_TYPE m_final = local_m[0]; + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f); + ACC_TYPE l_i = 0.0f; + + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1; + const global DATA_TYPE4* k_ptr = (const global DATA_TYPE4*)(k_base + k_row_offset); + const global DATA_TYPE4* v_ptr = (const global DATA_TYPE4*)(v_base + v_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global DATA_TYPE* mask_ptr = (const global DATA_TYPE*)(mask_base); + score += slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + const ACC_TYPE p = exp(score - m_final); + l_i += p; + #pragma unroll + for (int i = 0; i < DV_VEC; i++) { + o_acc[i] = mad(p, CONVERT_ACC4(v_ptr[i]), o_acc[i]); + } + } + + __local ACC_TYPE local_l[Q1_WG_SIZE]; + __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE]; + local_l[tid] = l_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_l[tid] += local_l[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1; + global DATA_TYPE4 *o_row = (global DATA_TYPE4 *)(o_base + o_row_offset); + const ACC_TYPE l_final = local_l[0]; + + if (l_final > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_final; + for (int i = 0; i < DV_VEC; i++) { + local_o_comp[tid] = o_acc[i]; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_o_comp[tid] += local_o_comp[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + o_row[i] = CONVERT_DATA4(local_o_comp[0] * 
l_inv); + } + } + } else if (tid == 0) { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_row[i] = (DATA_TYPE4)(0.0f); + } +} diff --git a/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl b/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl new file mode 100644 index 0000000000..7067bd2591 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl @@ -0,0 +1,346 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define ACC_TYPE float +#define ACC_TYPE4 float4 +#define Q_DATA_TYPE4 float4 +#define KV_DATA_TYPE4 half4 +#define O_DATA_TYPE4 float4 +#define MASK_DATA_TYPE half +#define CONVERT_Q_ACC4(x) (x) +#define CONVERT_KV_ACC4(x) convert_float4(x) +#define CONVERT_O_DATA4(x) (x) + +#define DK_VEC (DK/4) +#define DV_VEC (DV/4) +#define WG_SIZE (BLOCK_M) +#define Q1_WG_SIZE 64 + +inline float get_alibi_slope( + const float max_bias, const uint h, const uint n_head_log2, const float m0, const float m1 +) { + if (max_bias <= 0.0f) { + return 1.0f; + } + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + return pow(base, exph); +} +__kernel void flash_attn_f32_f16( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong v_nb2, const ulong v_nb3, + const ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int block_q_idx = get_group_id(0); + const int head_batch_idx = get_global_id(1); + + const int my_query_row = block_q_idx * BLOCK_M + tid; + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + if (my_query_row < n_q) { + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2 + my_query_row * q_nb1; + const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]); + } + } + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = (ACC_TYPE4)(0.0f); + } + ACC_TYPE m_i = -INFINITY; + ACC_TYPE l_i = 0.0f; + + float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + __local KV_DATA_TYPE4 
l_k[BLOCK_N][DK_VEC]; + __local KV_DATA_TYPE4 l_v[BLOCK_N][DV_VEC]; + + for (int k_start = 0; k_start < n_kv; k_start += BLOCK_N) { + for (int i = tid; i < BLOCK_N * DK_VEC; i += WG_SIZE) { + const int row = i / DK_VEC; + const int col = i % DK_VEC; + const int k_row_idx = k_start + row; + if (k_row_idx < n_kv) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_row_idx * k_nb1; + l_k[row][col] = ((__global KV_DATA_TYPE4*)(k_base + k_row_offset))[col]; + } + } + for (int i = tid; i < BLOCK_N * DV_VEC; i += WG_SIZE) { + const int row = i / DV_VEC; + const int col = i % DV_VEC; + const int v_row_idx = k_start + row; + if (v_row_idx < n_kv) { + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + v_row_idx * v_nb1; + l_v[row][col] = ((__global KV_DATA_TYPE4*)(v_base + v_row_offset))[col]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (my_query_row >= n_q) { + continue; + } + + for (int j = 0; j < BLOCK_N; j += 2) { + const int k_row0 = k_start + j; + const int k_row1 = k_start + j + 1; + + ACC_TYPE4 dot_acc0 = (ACC_TYPE4)(0.0f); + ACC_TYPE4 dot_acc1 = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc0 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j][k]), dot_acc0); + dot_acc1 = mad(q_priv[k], CONVERT_KV_ACC4(l_k[j+1][k]), dot_acc1); + } + ACC_TYPE score0 = (dot_acc0.s0 + dot_acc0.s1 + dot_acc0.s2 + dot_acc0.s3) * scale; + ACC_TYPE score1 = (dot_acc1.s0 + dot_acc1.s1 + dot_acc1.s2 + dot_acc1.s3) * scale; + + if (is_causal) { + if (k_row0 > (n_kv - n_q + my_query_row)) score0 = -INFINITY; + if (k_row1 > (n_kv - n_q + my_query_row)) score1 = -INFINITY; + } + + if (k_row0 >= n_kv) score0 = -INFINITY; + if (k_row1 >= n_kv) score1 = -INFINITY; + + if (mask_base != NULL) { + const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base + my_query_row * mask_nb1); + if (k_row0 < n_kv) score0 += slope * (ACC_TYPE)mask_ptr[k_row0]; + if (k_row1 < n_kv) score1 += slope * (ACC_TYPE)mask_ptr[k_row1]; + } + + if (logit_softcap > 0.0f) { + score0 = logit_softcap * tanh(score0 / logit_softcap); + score1 = logit_softcap * tanh(score1 / logit_softcap); + } + + const ACC_TYPE m_new = max(m_i, max(score0, score1)); + const ACC_TYPE p0 = exp(score0 - m_new); + const ACC_TYPE p1 = exp(score1 - m_new); + const ACC_TYPE scale_prev = exp(m_i - m_new); + + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_acc[i] = o_acc[i] * scale_prev + p0 * CONVERT_KV_ACC4(l_v[j][i]) + p1 * CONVERT_KV_ACC4(l_v[j+1][i]); + } + l_i = l_i * scale_prev + p0 + p1; + m_i = m_new; + } + } + + if (my_query_row < n_q) { + const ulong o_row_offset = batch_idx * o_nb3 + my_query_row * o_nb2 + head_idx * o_nb1; + global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset); + if (l_i > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_i; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = CONVERT_O_DATA4(o_acc[i] * l_inv); + } + } else { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) { + o_row[i] = (O_DATA_TYPE4)(0.0f); + } + } + } +} + +__kernel void flash_attn_f32_f16_q1( + const global void * q_void, ulong q_offset, + const global void * k_void, ulong k_offset, + const global void * v_void, ulong v_offset, + global void * o_void, ulong o_offset, + const float scale, + const int n_q, + const int n_kv, + const int is_causal, + const int n_head, + const ulong q_nb1, const ulong q_nb2, const ulong q_nb3, + const ulong k_nb1, const ulong k_nb2, const ulong k_nb3, + const ulong v_nb1, const ulong v_nb2, const ulong v_nb3, + const 
ulong o_nb1, const ulong o_nb2, const ulong o_nb3, + const float max_bias, + const float m0, + const float m1, + const int n_head_log2, + const float logit_softcap, + const int n_head_kv, + const global void* mask_void, + const ulong mask_offset, + const ulong mask_nb1, + const ulong mask_nb2, + const ulong mask_nb3, + const int mask_ne2, + const int mask_ne3 +) { + const int tid = get_local_id(0); + const int head_batch_idx = get_global_id(1); + + const int batch_idx = head_batch_idx / n_head; + const int head_idx = head_batch_idx % n_head; + + const int gqa_ratio = n_head / n_head_kv; + const int head_kv_idx = head_idx / gqa_ratio; + + const global char* q_base = (const global char*)q_void + q_offset; + const global char* k_base = (const global char*)k_void + k_offset; + const global char* v_base = (const global char*)v_void + v_offset; + global char* o_base = (global char*)o_void + o_offset; + + const global char* mask_base = NULL; + if (mask_void != NULL) { + const int mask_head_idx = head_idx % mask_ne2; + const int mask_batch_idx = batch_idx % mask_ne3; + mask_base = (const global char*)mask_void + mask_offset + mask_batch_idx * mask_nb3 + mask_head_idx * mask_nb2; + } + + ACC_TYPE4 q_priv[DK_VEC]; + const ulong q_row_offset = batch_idx * q_nb3 + head_idx * q_nb2; + const global Q_DATA_TYPE4* q_ptr = (const global Q_DATA_TYPE4*)(q_base + q_row_offset); + #pragma unroll + for (int i = 0; i < DK_VEC; ++i) { + q_priv[i] = CONVERT_Q_ACC4(q_ptr[i]); + } + + float slope = get_alibi_slope(max_bias, head_idx, n_head_log2, m0, m1); + + ACC_TYPE m_i = -INFINITY; + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global MASK_DATA_TYPE* mask_ptr = (const global MASK_DATA_TYPE*)(mask_base); + score += slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + m_i = max(m_i, score); + } + + __local ACC_TYPE local_m[Q1_WG_SIZE]; + local_m[tid] = m_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_m[tid] = max(local_m[tid], local_m[tid + s]); + barrier(CLK_LOCAL_MEM_FENCE); + } + const ACC_TYPE m_final = local_m[0]; + + ACC_TYPE4 o_acc[DV_VEC]; + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_acc[i] = (ACC_TYPE4)(0.0f); + ACC_TYPE l_i = 0.0f; + + for (int k_idx = tid; k_idx < n_kv; k_idx += Q1_WG_SIZE) { + const ulong k_row_offset = batch_idx * k_nb3 + head_kv_idx * k_nb2 + k_idx * k_nb1; + const ulong v_row_offset = batch_idx * v_nb3 + head_kv_idx * v_nb2 + k_idx * v_nb1; + const global KV_DATA_TYPE4* k_ptr = (const global KV_DATA_TYPE4*)(k_base + k_row_offset); + const global KV_DATA_TYPE4* v_ptr = (const global KV_DATA_TYPE4*)(v_base + v_row_offset); + ACC_TYPE4 dot_acc = (ACC_TYPE4)(0.0f); + #pragma unroll + for (int k = 0; k < DK_VEC; k++) { + dot_acc = mad(q_priv[k], CONVERT_KV_ACC4(k_ptr[k]), dot_acc); + } + ACC_TYPE score = (dot_acc.s0 + dot_acc.s1 + dot_acc.s2 + dot_acc.s3) * scale; + if (mask_base != NULL) { + const global MASK_DATA_TYPE* mask_ptr = (const global 
MASK_DATA_TYPE*)(mask_base); + score += slope * (ACC_TYPE)mask_ptr[k_idx]; + } + if (logit_softcap > 0.0f) { + score = logit_softcap * tanh(score / logit_softcap); + } + const ACC_TYPE p = exp(score - m_final); + l_i += p; + #pragma unroll + for (int i = 0; i < DV_VEC; i++) { + o_acc[i] = mad(p, CONVERT_KV_ACC4(v_ptr[i]), o_acc[i]); + } + } + + __local ACC_TYPE local_l[Q1_WG_SIZE]; + __local ACC_TYPE4 local_o_comp[Q1_WG_SIZE]; + local_l[tid] = l_i; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_l[tid] += local_l[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + const ulong o_row_offset = batch_idx * o_nb3 + head_idx * o_nb1; + global O_DATA_TYPE4 *o_row = (global O_DATA_TYPE4 *)(o_base + o_row_offset); + const ACC_TYPE l_final = local_l[0]; + + if (l_final > 0.0f) { + const ACC_TYPE l_inv = 1.0f / l_final; + for (int i = 0; i < DV_VEC; i++) { + local_o_comp[tid] = o_acc[i]; + barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int s = Q1_WG_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) local_o_comp[tid] += local_o_comp[tid + s]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + o_row[i] = CONVERT_O_DATA4(local_o_comp[0] * l_inv); + } + } + } else if (tid == 0) { + #pragma unroll + for (int i = 0; i < DV_VEC; ++i) o_row[i] = (O_DATA_TYPE4)(0.0f); + } +} diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 94f6405ca1..727932123e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -566,7 +566,7 @@ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8 for (int i = 0; i < n; ++i) { L[i] += nmax; } - return sumlx / suml2; + return suml2 > 0.0f ? sumlx / suml2 : 0.0f; } for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); @@ -901,7 +901,7 @@ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); } - if (!max) { // all zero + if (max < GROUP_MAX_EPS) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } return 0.f; } @@ -966,7 +966,7 @@ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint break; } } - return sumlx/suml2; + return suml2 > 0.0f ? sumlx / suml2 : 0.0f; } static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) { @@ -4266,7 +4266,7 @@ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_R sumw[j+1] = sumw[j] + weight[i]; } } - float best_score = -FLT_MIN, scale = max; + float best_score = -FLT_MAX, scale = max; int besti1 = -1, besti2 = -1, best_shift = 0; for (int i1 = 0; i1 <= block_size; ++i1) { for (int i2 = i1; i2 <= block_size; ++i2) { @@ -4442,7 +4442,7 @@ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_R idx[2*j] = j; } qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); - float best_score = -FLT_MIN, scale = max; + float best_score = -FLT_MAX, scale = max; int besti1 = -1, besti2 = -1, best_k = -1; // 0: +, + // 1: +, - diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 76a0cfa4f0..fb18a55cda 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -103,6 +103,8 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } struct ggml_backend_vk_context; #define MAX_PARAMETER_COUNT 8 +// Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT. 
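The MAX_FUSED_ADDS bound defined just below can be read as follows (an inference from the comment above, stated as an assumption): a chain of N fused adds binds N+1 source tensors plus one destination, i.e. N+2 parameters, so N is capped at MAX_PARAMETER_COUNT - 2. As standalone arithmetic:

```cpp
// Standalone restatement of the bound; MAX_PARAMETER_COUNT mirrors the value in the patch.
constexpr int MAX_PARAMETER_COUNT = 8;
constexpr int MAX_FUSED_ADDS      = MAX_PARAMETER_COUNT - 2; // 6

// n fused adds -> n+1 source bindings plus 1 destination binding.
constexpr int bindings(int n_adds) { return (n_adds + 1) + 1; }

static_assert(bindings(MAX_FUSED_ADDS)     == MAX_PARAMETER_COUNT, "largest fusion fits exactly");
static_assert(bindings(MAX_FUSED_ADDS + 1) >  MAX_PARAMETER_COUNT, "one more add would overflow");

int main() { return MAX_FUSED_ADDS; }
```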
+#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 2) struct vk_pipeline_struct { std::string name; @@ -343,6 +345,15 @@ enum vk_conv_shapes { CONV_SHAPE_COUNT, }; +enum dmmv_wg_sizes { + DMMV_WG_SIZE_SUBGROUP, + DMMV_WG_SIZE_LARGE, + DMMV_WG_SIZE_COUNT, +}; + +static constexpr uint32_t num_argsort_pipelines = 11; +static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1); + struct vk_device_struct { std::recursive_mutex mutex; @@ -368,6 +379,7 @@ struct vk_device_struct { bool float_controls_rte_fp16; bool subgroup_add; bool subgroup_shuffle; + bool multi_add; bool integer_dot_product; @@ -429,8 +441,8 @@ struct vk_device_struct { vk_pipeline pipeline_quantize_q8_1; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; - vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols]; - vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio]; @@ -449,12 +461,16 @@ struct vk_device_struct { vk_pipeline pipeline_div[2][2][2]; vk_pipeline pipeline_div_norepeat[2][2][2]; + // indexed by num_additional_fused_ops == num_adds - 1 + vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS]; + vk_pipeline pipeline_add_id_f32; vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; + vk_pipeline pipeline_sqrt_f32; vk_pipeline pipeline_sin_f32; vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_clamp_f32; @@ -474,6 +490,7 @@ struct vk_device_struct { vk_pipeline pipeline_l2_norm_f32; // [src/dst 0=fp32,1=fp16] + vk_pipeline pipeline_exp[2]; vk_pipeline pipeline_gelu[2]; vk_pipeline pipeline_gelu_erf[2]; vk_pipeline pipeline_gelu_quick[2]; @@ -499,7 +516,7 @@ struct vk_device_struct { vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; - vk_pipeline pipeline_argsort_f32; + vk_pipeline pipeline_argsort_f32[num_argsort_pipelines]; vk_pipeline pipeline_sum_rows_f32; vk_pipeline pipeline_argmax_f32; vk_pipeline pipeline_count_equal_i32; @@ -513,8 +530,8 @@ struct vk_device_struct { vk_pipeline pipeline_opt_step_sgd_f32; vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT]; vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT]; - vk_pipeline pipeline_conv2d_dw_whcn_f32; - vk_pipeline pipeline_conv2d_dw_cwhn_f32; + vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32; + vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32; // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} vk_pipeline pipeline_flash_attn_f32_f16_cm2[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2]; @@ -801,6 +818,14 @@ struct vk_op_binary_push_constants { float param1; float param2; int32_t param3; }; +struct vk_op_multi_add_push_constants { + // shape for dst + uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; + + // strides for srcs+dst + uint32_t nb[8][4]; +}; + struct vk_op_add_id_push_constants { uint32_t ne0; uint32_t ne1; @@ -856,7 +881,6 @@ struct 
vk_op_soft_max_push_constants { struct vk_op_argsort_push_constants { uint32_t ncols; - uint32_t ncols_pad; int32_t order; }; @@ -1170,6 +1194,10 @@ struct ggml_backend_vk_context { vk::Fence fence, almost_ready_fence; bool almost_ready_fence_pending {}; + // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. + vk_pipeline_struct * prealloc_y_last_pipeline_used {}; + const ggml_tensor * prealloc_y_last_tensor_used {}; + vk_buffer buffer_pool[MAX_VK_BUFFERS]; vk_context_ref compute_ctx; @@ -2387,26 +2415,26 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) } #endif - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f16acc, matmul_id_iq1_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f16acc, matmul_id_iq1_m_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f16, , mmqid_wg_denoms, warptile_mmqid, 
vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f16acc, matmul_id_mxfp4_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) #undef CREATE_MM #undef CREATE_MM2 } else @@ -2502,51 +2530,27 @@ static void ggml_vk_load_shaders(vk_device& device) { } #endif - if (device->coopmat_acc_f16_support) { - CREATE_MM(GGML_TYPE_Q4_0, 
pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f16acc, matmul_id_iq1_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f16acc, matmul_id_iq1_m_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - 
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f16acc, matmul_id_mxfp4_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - } else { - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f16acc, matmul_id_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f16acc, matmul_id_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, 
vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f16acc, matmul_id_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - } + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); #undef CREATE_MM2 #undef CREATE_MM } else @@ -2631,27 +2635,27 @@ static 
void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f16acc, matmul_id_iq1_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f16acc, matmul_id_iq1_m_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, 
_id); - CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f16acc, matmul_id_mxfp4_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM2(GGML_TYPE_MXFP4, 
pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); #undef CREATE_MM2 #undef CREATE_MMQ #undef CREATE_MM @@ -2780,54 +2784,61 @@ static void ggml_vk_load_shaders(vk_device& device) { rm_stdq = 2; uint32_t rm_iq = 2 * rm_kq; - for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1), mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32_"+std::to_string(i+1), mul_mat_vec_bf16_f32_f32_len, mul_mat_vec_bf16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f32_f32_len, mul_mat_vec_iq1_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f32_f32_len, mul_mat_vec_iq1_m_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f32_f32_len, mul_mat_vec_iq2_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f32_f32_len, mul_mat_vec_iq2_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f32_f32_len, mul_mat_vec_iq3_s_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f32_f32_len, mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], 
"mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32_"+std::to_string(i+1), mul_mat_vec_mxfp4_f32_f32_len, mul_mat_vec_mxfp4_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); + for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) { + uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_16 : (subgroup_size_16 * 4); + uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? device->subgroup_size : (device->subgroup_size * 4); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32_"+std::to_string(i+1), mul_mat_vec_bf16_f16_f32_len, mul_mat_vec_bf16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 
3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_s_f16_f32_len, mul_mat_vec_iq1_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq1_m_f16_f32_len, mul_mat_vec_iq1_m_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xs_f16_f32_len, mul_mat_vec_iq2_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_s_f16_f32_len, mul_mat_vec_iq2_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_s_f16_f32_len, mul_mat_vec_iq3_s_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, 
rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_xs_f16_f32_len, mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32_"+std::to_string(i+1), mul_mat_vec_mxfp4_f16_f32_len, mul_mat_vec_mxfp4_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq, i+1}, 1, true); + const bool s = device->subgroup_add && device->architecture != vk_device_architecture::AMD_GCN; + + for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[s], arr_dmmv_f32_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[s], arr_dmmv_f16_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[s], arr_dmmv_bf16_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[s], arr_dmmv_q4_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[s], arr_dmmv_q4_1_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[s], arr_dmmv_q5_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[s], arr_dmmv_q5_1_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[s], arr_dmmv_q8_0_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[s], arr_dmmv_q2_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[s], arr_dmmv_q3_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[s], arr_dmmv_q4_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[s], arr_dmmv_q5_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[s], arr_dmmv_q6_k_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[s], arr_dmmv_iq1_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[s], arr_dmmv_iq1_m_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[s], arr_dmmv_iq2_xxs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[s], arr_dmmv_iq2_xs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[s], arr_dmmv_iq2_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[s], arr_dmmv_iq3_xxs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[s], arr_dmmv_iq3_s_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, 
i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[s], arr_dmmv_iq4_xs_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[s], arr_dmmv_iq4_nl_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[s], arr_dmmv_mxfp4_f32_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[s], arr_dmmv_f32_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[s], arr_dmmv_f16_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[s], arr_dmmv_bf16_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[s], arr_dmmv_q4_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[s], arr_dmmv_q4_1_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[s], arr_dmmv_q5_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[s], arr_dmmv_q5_1_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[s], arr_dmmv_q8_0_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[s], arr_dmmv_q2_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, 
i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[s], arr_dmmv_q3_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[s], arr_dmmv_q4_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[s], arr_dmmv_q5_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[s], arr_dmmv_q6_k_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[s], arr_dmmv_iq1_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[s], arr_dmmv_iq1_m_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[s], arr_dmmv_iq2_xxs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[s], arr_dmmv_iq2_xs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[s], arr_dmmv_iq2_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[s], arr_dmmv_iq3_xxs_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[s], arr_dmmv_iq3_s_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[s], arr_dmmv_iq4_xs_f16_f32_data[s], "main", 3, 
sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[s], arr_dmmv_iq4_nl_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[s], arr_dmmv_mxfp4_f16_f32_data[s], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true); + } } ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); @@ -3018,6 +3029,12 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_BINARY(div, _norepeat, {1}) #undef CREATE_BINARY + if (device->multi_add) { + for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) { + ggml_vk_create_pipeline(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); + } + } + ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -3033,6 +3050,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -3049,6 +3067,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + CREATE_UNARY(exp) CREATE_UNARY(gelu) CREATE_UNARY(gelu_erf) CREATE_UNARY(gelu_quick) @@ -3103,7 +3122,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); } - ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", 
argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); + for (uint32_t i = 0; i < num_argsort_pipelines; ++i) { + ggml_vk_create_pipeline(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1u<<i, 1, 1}, {1u<<i, i}, 1, true); + } ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); @@ -3236,6 +3257,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); for (auto &c : compiles) { c.wait(); @@ -3557,6 +3580,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 && + device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) && + vk12_features.runtimeDescriptorArray && + device->vendor_id != VK_VENDOR_ID_INTEL && + getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr; + if (device->subgroup_size_control) { device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; @@ -4381,7 +4410,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } -static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) { +static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols, uint32_t m, uint32_t k) { VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()"); GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16); GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols); @@ -4415,7 +4444,24 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * return nullptr; } - return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type][num_cols-1]; + // heuristic to choose workgroup size + uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; + if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { + // Prefer larger workgroups when M is small, to spread the work out more + // and keep more SMs busy.
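+ // Illustrative example (numbers assumed, not measured): for a non-q6_k type with k >= 1024, m = 2048 picks DMMV_WG_SIZE_LARGE below, while m = 16384 stays on DMMV_WG_SIZE_SUBGROUP.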
+ // q6_k seems to prefer small workgroup size even for "medium" values of M. + if (a_type == GGML_TYPE_Q6_K) { + if (m < 4096 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } else { + if (m <= 8192 && k >= 1024) { + dmmv_wg = DMMV_WG_SIZE_LARGE; + } + } + } + + return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1]; } static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { @@ -4470,7 +4516,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co return nullptr; } - return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc; + // XXX TODO 'prec' is not actually allowed in mul_mat_id. + bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/; + bool support_fp16acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc != nullptr; + bool support_fp32acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc != nullptr; + + if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) { + return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc; + } else { + GGML_ASSERT(support_fp32acc); + return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc; + } } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { @@ -5603,10 +5659,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + } } if (quantize_y) { - ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13); + if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || + ctx->prealloc_y_last_tensor_used != src1) { + ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13); + ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + } } uint32_t stride_batch_x = ne00*ne01; @@ -5712,7 +5778,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } - vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11); + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); @@ -5781,7 +5847,12 @@ static void 
ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + } } // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride @@ -6211,7 +6282,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + } } uint32_t stride_batch_x = ne00*ne01; @@ -6399,7 +6475,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || + ctx->prealloc_y_last_tensor_used != src1) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); + ctx->prealloc_y_last_tensor_used = src1; + } } uint32_t stride_batch_y = ne10*ne11; @@ -6443,22 +6524,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx GGML_ASSERT(nei0 <= 4096); const uint32_t split_size = std::min(nei1, 4096u / nei0); - ggml_tensor src1_copy = *src1; - ggml_tensor src2_copy = *src2; - ggml_tensor dst_copy = *dst; + if (split_size == nei1) { + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); + } else { + ggml_tensor src1_copy = *src1; + ggml_tensor src2_copy = *src2; + ggml_tensor dst_copy = *dst; - for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) { - const uint32_t n_tokens = std::min(split_size, nei1 - token_start); + for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) { + const uint32_t n_tokens = std::min(split_size, nei1 - token_start); - src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2]; - src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1]; - dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2]; + src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2]; + src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1]; + dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2]; - src1_copy.ne[2] = n_tokens; - src2_copy.ne[1] = n_tokens; - dst_copy.ne[2] = n_tokens; + 
src1_copy.ne[2] = n_tokens; + src2_copy.ne[1] = n_tokens; + dst_copy.ne[2] = n_tokens; - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun); + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun); + // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor + ctx->prealloc_y_last_pipeline_used = {}; + ctx->prealloc_y_last_tensor_used = nullptr; + } } } } @@ -6901,6 +6989,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (op) { case GGML_OP_ADD: { + if (ctx->num_additional_fused_ops > 0) { + return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + } auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } @@ -6962,6 +7053,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_sqr_f32; } return nullptr; + case GGML_OP_SQRT: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_sqrt_f32; + } + return nullptr; case GGML_OP_SIN: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sin_f32; @@ -7041,6 +7137,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_EXP: + return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SILU: return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU: @@ -7145,7 +7243,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } case GGML_OP_ARGSORT: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) { - return ctx->device->pipeline_argsort_f32; + uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); + return ctx->device->pipeline_argsort_f32[idx]; } return nullptr; case GGML_OP_SUM: @@ -7249,6 +7348,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } else if (ggml_is_contiguous_channels(src1)) { return ctx->device->pipeline_conv2d_dw_cwhn_f32; } + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + if (ggml_is_contiguous(src1)) { + return ctx->device->pipeline_conv2d_dw_whcn_f16_f32; + } else if (ggml_is_contiguous_channels(src1)) { + return ctx->device->pipeline_conv2d_dw_cwhn_f16_f32; + } } return nullptr; default: @@ -7270,6 +7375,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_CONCAT: case GGML_OP_UPSCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -7575,6 +7681,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_MUL: case GGML_OP_SCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -7757,6 +7864,107 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } +static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + const ggml_tensor *first_node = cgraph->nodes[node_idx]; + const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; + + // Make a list of all the tensors used by the op. + // Last element of the list is the dest tensor. 
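+ // e.g. fusing t1 = a + b and t2 = t1 + c (illustrative names, num_additional_fused_ops == 1) yields tensors = { a, b, c, t2 }.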
+ const ggml_tensor *tensors[MAX_PARAMETER_COUNT]; + uint32_t num_srcs = ctx->num_additional_fused_ops + 2; + uint32_t num_tensors = num_srcs + 1; + GGML_ASSERT(num_tensors <= MAX_PARAMETER_COUNT); + + tensors[0] = first_node->src[0]; + tensors[1] = first_node->src[1]; + for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) { + // check whether the previous result is src[0] or src[1] + if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) { + tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1]; + } else { + tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0]; + } + } + tensors[num_srcs] = dst; + + vk_op_multi_add_push_constants pc; + pc.ne20 = (uint32_t)dst->ne[0]; + pc.ne21 = (uint32_t)dst->ne[1]; + pc.ne22 = (uint32_t)dst->ne[2]; + pc.ne23 = (uint32_t)dst->ne[3]; + + for (uint32_t i = 0; i < num_tensors; ++i) { + const ggml_tensor *t = tensors[i]; + pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float); + pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float); + pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float); + pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float); + } + + vk_pipeline pipeline = ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + + if (pipeline == nullptr) { + std::cerr << "ggml_vulkan: Error: Missing multi_add"; + GGML_ABORT("fatal error"); + } + + if (dryrun) { + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + return; + } + + ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT]; + vk_buffer buf[MAX_PARAMETER_COUNT]; + size_t offset[MAX_PARAMETER_COUNT]; + bool uma[MAX_PARAMETER_COUNT]; + + for (uint32_t i = 0; i < num_tensors; ++i) { + buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; + buf[i] = nullptr; + offset[i] = 0; + uma[i] = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); + uma[i] = buf[i] != nullptr; + } + if (!uma[i]) { + buf[i] = buf_ctx[i]->dev_buffer; + offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + } + GGML_ASSERT(buf[i] != nullptr); + } + // If any remaining descriptors are unused, just point them at src[0] + for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) { + buf[i] = buf[0]; + offset[i] = 0; + } + + std::array<uint32_t, 3> elements; + + uint32_t ne = ggml_nelements(dst); + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE }, + }, pc, elements); +} + static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); @@ -8121,6 +8329,10 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun); } +static void
ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun); +} + static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun); } @@ -8369,16 +8581,8 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c uint32_t ncols = src0->ne[0]; - uint32_t ncols_pad = 1; - while (ncols_pad < ncols) { - ncols_pad *= 2; - } - - GGML_ASSERT(ncols_pad <= 1024); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { ncols, - ncols_pad, op_params[0], }, dryrun); } @@ -9546,6 +9750,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return false; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -9584,6 +9789,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -9653,6 +9859,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -9717,8 +9924,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_ADD: - ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun); - + if (ctx->num_additional_fused_ops) { + ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx, dryrun); + } else { + ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun); + } break; case GGML_OP_SUB: ggml_vk_sub(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9751,6 +9961,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_SQR: ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_SQRT: + ggml_vk_sqrt(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_SIN: ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun); @@ -9814,6 +10028,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -10002,6 +10217,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -10049,6 +10265,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -10145,6 +10362,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ggml_vk_pool_free(ctx, buffer); } ctx->gc.temp_buffers.clear(); + ctx->prealloc_y_last_pipeline_used = {}; ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); ggml_vk_command_pool_cleanup(ctx->device, 
ctx->transfer_cmd_pool); @@ -10180,6 +10398,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ggml_vk_destroy_buffer(ctx->prealloc_x); ggml_vk_destroy_buffer(ctx->prealloc_y); ggml_vk_destroy_buffer(ctx->prealloc_split_k); + ctx->prealloc_y_last_pipeline_used = nullptr; for (auto& buffer : ctx->buffer_pool) { ggml_vk_destroy_buffer(buffer); @@ -10600,6 +10819,58 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st return true; } +static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { + + const ggml_tensor *first_node = cgraph->nodes[node_idx]; + if (first_node->op != GGML_OP_ADD) { + return 0; + } + + if (!ctx->device->multi_add) { + return 0; + } + + int32_t num_adds = 1; + while (node_idx + num_adds < cgraph->n_nodes && + cgraph->nodes[node_idx + num_adds]->op == GGML_OP_ADD && + num_adds < MAX_FUSED_ADDS) { + num_adds++; + } + + // The shader currently requires same shapes (but different strides are allowed), + // everything f32, and no misalignment + for (int32_t i = 0; i < num_adds; ++i) { + const ggml_tensor *next_node = cgraph->nodes[node_idx + i]; + if (!ggml_are_same_shape(first_node, next_node->src[0]) || + !ggml_are_same_shape(first_node, next_node->src[1]) || + next_node->type != GGML_TYPE_F32 || + next_node->src[0]->type != GGML_TYPE_F32 || + next_node->src[1]->type != GGML_TYPE_F32 || + get_misalign_bytes(ctx, next_node) || + get_misalign_bytes(ctx, next_node->src[0]) || + get_misalign_bytes(ctx, next_node->src[1])) { + num_adds = i; + } + } + + // Verify we can fuse these + ggml_op adds[MAX_FUSED_ADDS]; + for (int32_t i = 0; i < num_adds; ++i) { + adds[i] = GGML_OP_ADD; + } + + // decrease num_adds if they can't all be fused + while (num_adds > 1 && !ggml_can_fuse(cgraph, node_idx, adds, num_adds)) { + num_adds--; + } + + // a single add is not "fused", so just return zero + if (num_adds == 1) { + return 0; + } + return num_adds; +} + static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -10613,8 +10884,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; + if (!ctx->device->disable_fusion) { + uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); + if (num_adds) { + ctx->num_additional_fused_ops = num_adds - 1; + } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } } ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { @@ -10671,6 +10947,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0); } + ctx->prealloc_y_last_pipeline_used = nullptr; + ctx->prealloc_y_last_tensor_used = nullptr; + // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. 
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB // (and scaled down based on model size, so smaller models submit earlier). @@ -10689,8 +10968,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } - if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; + if (!ctx->device->disable_fusion) { + uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); + if (num_adds) { + ctx->num_additional_fused_ops = num_adds - 1; + } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } } // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) @@ -10897,6 +11181,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: @@ -11179,6 +11464,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_SILU_BACK: case GGML_OP_RMS_NORM_BACK: case GGML_OP_SQR: + case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: @@ -11186,6 +11472,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ARGSORT: + return op->ne[0] <= max_argsort_cols; case GGML_OP_UPSCALE: case GGML_OP_ACC: case GGML_OP_CONCAT: @@ -11195,7 +11483,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ARGSORT: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_ARGMAX: @@ -11622,6 +11909,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); } else if (tensor->op == GGML_OP_SQR) { tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SQRT) { + tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_SIN) { tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_COS) { @@ -11692,6 +11981,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_EXP: + tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_SILU: tensor_clone = ggml_silu(ggml_ctx, src_clone[0]); break; @@ -11723,6 +12015,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } else { tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); } + ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2)); + ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3)); } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); @@ -11807,6 +12101,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * src_clone[0]->flags = src0->flags; tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], 
src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_ADD_ID) { + tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp index d4fa45b1e1..dc53a401e0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp @@ -1,22 +1,24 @@ #version 450 +#extension GL_EXT_control_flow_attributes : enable #include "types.comp" -#define BLOCK_SIZE 1024 +layout(constant_id = 0) const int BLOCK_SIZE = 1024; +layout(constant_id = 1) const int BLOCK_SIZE_LOG2 = 10; #define ASC 0 -layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) buffer D {int data_d[];}; layout (push_constant) uniform parameter { uint ncols; - uint ncols_pad; uint order; } p; shared int dst_row[BLOCK_SIZE]; +shared A_TYPE a_sh[BLOCK_SIZE]; void swap(uint idx0, uint idx1) { int tmp = dst_row[idx0]; @@ -24,7 +26,7 @@ void swap(uint idx0, uint idx1) { dst_row[idx1] = tmp; } -void main() { +void argsort(bool needs_bounds_check) { // bitonic sort const int col = int(gl_LocalInvocationID.x); const uint row = gl_WorkGroupID.y; @@ -32,38 +34,46 @@ void main() { const uint row_offset = row * p.ncols; // initialize indices - if (col < p.ncols_pad) { - dst_row[col] = col; - } + dst_row[col] = col; + a_sh[col] = data_a[row_offset + col]; barrier(); - for (uint k = 2; k <= p.ncols_pad; k *= 2) { - for (uint j = k / 2; j > 0; j /= 2) { - const uint ixj = col ^ j; - if (col < p.ncols_pad && ixj > col) { - if ((col & k) == 0) { - if (dst_row[col] >= p.ncols || - (dst_row[ixj] < p.ncols && (p.order == ASC ? - data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] : - data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]])) - ) { - swap(col, ixj); - } - } else { - if (dst_row[ixj] >= p.ncols || - (dst_row[col] < p.ncols && (p.order == ASC ? - data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] : - data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]])) - ) { - swap(col, ixj); - } - } + uint num_outer_loop_iters = BLOCK_SIZE_LOG2; + [[unroll]] for (uint k = 2, outer_idx = 0; outer_idx < num_outer_loop_iters; k *= 2, outer_idx++) { + uint num_inner_loop_iters = outer_idx + 1; + [[unroll]] for (uint j = k / 2, inner_idx = 0; inner_idx < num_inner_loop_iters; j /= 2, inner_idx++) { + const int ixj = int(col ^ j); + + int idx_0 = (col & k) == 0 ? col : ixj; + int idx_1 = (col & k) == 0 ? ixj : col; + + int sh_idx_0 = dst_row[idx_0]; + int sh_idx_1 = dst_row[idx_1]; + bool idx_0_oob = needs_bounds_check ? sh_idx_0 >= p.ncols : false; + bool idx_1_oob = needs_bounds_check ? 
sh_idx_1 >= p.ncols : false; + + if ((idx_0_oob || + (!idx_1_oob && a_sh[sh_idx_0] > a_sh[sh_idx_1])) && (ixj > col)) { + swap(idx_0, idx_1); } + barrier(); } } if (col < p.ncols) { - data_d[row_offset + col] = dst_row[col]; + if (p.order == ASC) { + data_d[row_offset + col] = dst_row[col]; + } else { + data_d[row_offset + p.ncols - col - 1] = dst_row[col]; + } + } +} + +void main() { + if (p.ncols == BLOCK_SIZE) { + argsort(false); + } else { + argsort(true); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp new file mode 100644 index 0000000000..abecd2d3dc --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp @@ -0,0 +1,20 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + data_d[i] = D_TYPE(exp(float(data_a[i]))); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 230e815f22..81cc3f81fc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -210,7 +210,7 @@ void main() { [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Of[r][d] = float16_t(eMf[r]) * Of[r][d]; + Of[r][d] = ACC_TYPE(eMf[r]) * Of[r][d]; } } [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { @@ -233,7 +233,7 @@ void main() { vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]); #endif [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Of[r][d] += float16_t(Pf[r]) * ACC_TYPEV4(Vf); + Of[r][d] += ACC_TYPE(Pf[r]) * ACC_TYPEV4(Vf); } } } @@ -288,7 +288,7 @@ void main() { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { - Of[r][d] = float16_t(eMf[r]) * Of[r][d]; + Of[r][d] = ACC_TYPE(eMf[r]) * Of[r][d]; tmpshv4[tid] = Of[r][d]; barrier(); @@ -357,7 +357,7 @@ void main() { [[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Of[r][d] *= float16_t(Lfrcp[r]); + Of[r][d] *= ACC_TYPE(Lfrcp[r]); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp index 4b4316cf3d..750e785753 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp @@ -2,6 +2,7 @@ #extension GL_EXT_control_flow_attributes : require #include "rte.comp" +#include "utils.comp" layout (push_constant) uniform parameter { @@ -28,25 +29,9 @@ uint get_aoffset() { return p.misalign_offsets >> 16; } uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; } uint get_doffset() { return p.misalign_offsets & 0xFF; } -// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1 -uint fastmod(uint a, uint b) { - if ((b & (b-1)) == 0) { - return a & (b-1); - } - return a % b; -} - -uint fastdiv(uint a, uint b) { - return (a < b) 
? 0 : (a / b); -} void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03) { - i03 = fastdiv(idx, (p.ne02*p.ne01*p.ne00)); - const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - i02 = fastdiv((idx - i03_offset), (p.ne01*p.ne00)); - const uint i02_offset = i02*p.ne01*p.ne00; - i01 = (idx - i03_offset - i02_offset) / p.ne00; - i00 = idx - i03_offset - i02_offset - i01*p.ne00; + get_indices(idx, i00, i01, i02, i03, p.ne00, p.ne01, p.ne02, p.ne03); } uint src0_idx(uint i00, uint i01, uint i02, uint i03) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 903753c7e2..b93e9948f7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -1,6 +1,10 @@ #extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require +#if USE_SUBGROUP_ADD +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#endif #ifdef MUL_MAT_ID #define EXPERT_COUNT 8 @@ -90,7 +94,38 @@ layout (constant_id = 2) const uint NUM_COLS = 1; shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE]; -void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) { +void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) { + // subgroupAdd is probably faster on devices that support it, + // particularly when the workgroup has more than one subgroup +#if USE_SUBGROUP_ADD + // sum up partial sums within a subgroup + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + temp[j][n] = subgroupAdd(temp[j][n]); + } + } + + // Go through shared memory to sum partials across subgroups + if (gl_SubgroupInvocationID == 0) { + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[j][n][gl_SubgroupID] = temp[j][n]; + } + } + } + barrier(); + if (tid == 0) { + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + temp[j][n] = FLOAT_TYPE(0); + [[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) { + temp[j][n] += tmpsh[j][n][s]; + } + data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]); + } + } + } +#else // sum up partial sums and write back result [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { @@ -115,4 +150,5 @@ void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32 } } } +#endif } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 8c5114a79d..a61a464c7b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -801,7 +801,7 @@ void main() { } #else const uint row_i = ic * BN + loadc_b + l; - if (row_i < _ne1) { + if (row_i < _ne1 && block + loadr_b < end_k) { const u16vec2 row_idx = row_ids[row_i]; buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); } else { @@ -875,7 +875,9 @@ void main() { const u16vec2 row_idx = row_ids[row_i]; - 
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + if (dr + cm_row * TM + store_r < p.M) { + data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } } } } @@ -925,7 +927,9 @@ void main() { #endif // MUL_MAT_ID [[unroll]] for (uint cr = 0; cr < TM; cr++) { #ifdef MUL_MAT_ID - data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); + if (dr_warp + cr < p.M) { + data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); + } #else if (dr_warp + cr < p.M && dc_warp + cc < p.N) { data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp new file mode 100644 index 0000000000..0c7acb7060 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp @@ -0,0 +1,68 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_nonuniform_qualifier : enable +#extension GL_EXT_control_flow_attributes : require + +#include "rte.comp" +#include "types.comp" +#include "utils.comp" + +layout (push_constant) uniform parameter2 +{ + // shape for dst + uint ne20; uint ne21; uint ne22; uint ne23; + + // strides for srcs+dst + uint nb[8][4]; +} p; + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[]; +layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[]; + +layout(constant_id = 0) const uint num_srcs = 2; + +uint src_idx(uint s, uint i00, uint i01, uint i02, uint i03) { + return i03*p.nb[s][3] + i02*p.nb[s][2] + i01*p.nb[s][1] + i00*p.nb[s][0]; +} + +uint dst_idx(uint i00, uint i01, uint i02, uint i03) { + uint nb20 = p.nb[num_srcs][0]; + uint nb21 = p.nb[num_srcs][1]; + uint nb22 = p.nb[num_srcs][2]; + uint nb23 = p.nb[num_srcs][3]; + return i03*nb23 + i02*nb22 + i01*nb21 + i00*nb20; +} + +uint get_idx() { + return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; +} + +const uint num_threads = 256; + +layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; + +void main() { + uint idx = get_idx(); + + uint ne = p.ne20 * p.ne21 * p.ne22 * p.ne23; + + // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation + const uint num_iter = 2; + + [[unroll]] for (uint i = 0; i < num_iter; ++i) { + if (idx >= ne) { + continue; + } + uint i00, i01, i02, i03; + get_indices(idx, i00, i01, i02, i03, p.ne20, p.ne21, p.ne22, p.ne23); + + FLOAT_TYPE sum = FLOAT_TYPE(0); + [[unroll]] for (uint s = 0; s < num_srcs; ++s) { + sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]); + } + d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum); + + idx += num_threads; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp new file mode 100644 index 0000000000..4bc697b9b9 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp @@ -0,0 +1,17 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = 
get_idx(); + + if (idx >= p.ne) { + return; + } + + const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sqrt(val)); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp b/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp new file mode 100644 index 0000000000..dc4a1e6d96 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp @@ -0,0 +1,25 @@ +#ifndef UTILS_COMP +#define UTILS_COMP + +// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1 +uint fastmod(uint a, uint b) { + if ((b & (b-1)) == 0) { + return a & (b-1); + } + return a % b; +} + +uint fastdiv(uint a, uint b) { + return (a < b) ? 0 : (a / b); +} + +void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03, uint ne00, uint ne01, uint ne02, uint ne03) { + i03 = fastdiv(idx, (ne02*ne01*ne00)); + const uint i03_offset = i03 * ne02*ne01*ne00; + i02 = fastdiv((idx - i03_offset), (ne01*ne00)); + const uint i02_offset = i02*ne01*ne00; + i01 = (idx - i03_offset - i02_offset) / ne00; + i00 = idx - i03_offset - i02_offset - i01*ne00; +} + +#endif // UTILS_COMP diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 68933d19f2..123ae04491 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -223,7 +223,8 @@ void string_to_spv_func(const std::string& _name, const std::string& in_fname, c std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 - std::string opt_level = coopmat ? "" : "-O"; + // disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344 + std::string opt_level = (coopmat || name.find("bf16") != std::string::npos) ? 
"" : "-O"; #ifdef _WIN32 std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; @@ -472,6 +473,9 @@ void process_shaders() { string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f32_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}})); + string_to_spv("mul_mat_vec_" + tname + "_f16_f32_subgroup", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}})); + string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); // Dequant shaders @@ -566,6 +570,8 @@ void process_shaders() { string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + string_to_spv("sqrt_f32", "sqrt.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); @@ -580,6 +586,8 @@ void process_shaders() { string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); @@ -672,11 +680,15 @@ void process_shaders() { string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); + string_to_spv("conv2d_dw_whcn_f16_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); + string_to_spv("conv2d_dw_cwhn_f16_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("add_id_f32", "add_id.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}}); + for (auto &c : compiles) { c.wait(); } @@ -785,6 
+797,18 @@ void write_output_files() { fputs(data.c_str(), src); fputs(len.c_str(), src); } + + for (const std::string& btype : {"f16", "f32"}) { + for (const auto& tname : type_names) { + fprintf(hdr, "extern unsigned char *arr_dmmv_%s_%s_f32_data[2];\n", tname.c_str(), btype.c_str()); + fprintf(hdr, "extern uint64_t arr_dmmv_%s_%s_f32_len[2];\n", tname.c_str(), btype.c_str()); + std::string data = "unsigned char *arr_dmmv_" + tname + "_" + btype + "_f32_data[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_data, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_data};\n"; + std::string len = "uint64_t arr_dmmv_" + tname + "_" + btype + "_f32_len[2] = {mul_mat_vec_" + tname + "_" + btype + "_f32_len, mul_mat_vec_" + tname + "_" + btype + "_f32_subgroup_len};\n"; + fputs(data.c_str(), src); + fputs(len.c_str(), src); + } + } + fclose(hdr); fclose(src); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 911eea504a..41804f3a2b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2832,6 +2832,7 @@ class VisionProjectorType: QWEN2A = "qwen2a" # audio QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" + LFM2 = "lfm2" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c5c2798090..87edaa3232 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1272,6 +1272,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_INP_NORM: ( "multi_modal_projector.norm", + "multi_modal_projector.layer_norm", "pre_mm_projector_norm", ), diff --git a/include/llama.h b/include/llama.h index 135eaf1b65..662e0971df 100644 --- a/include/llama.h +++ b/include/llama.h @@ -64,8 +64,6 @@ extern "C" { typedef struct llama_memory_i * llama_memory_t; - struct llama_kv_cache; // DEPRECATED (use llama_memory instead) - typedef int32_t llama_pos; typedef int32_t llama_token; typedef int32_t llama_seq_id; @@ -469,8 +467,6 @@ extern "C" { LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type - DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); - LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -667,111 +663,6 @@ extern "C" { // Check if the memory supports shifting LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); - // - // KV cache for self-attention (TODO: deprecate in favor of llama_memory) - // - - // Returns the number of tokens in the KV cache (slow, use only for debug) - // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), - "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); - - // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) - DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), - "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); - - // Clear the KV cache - both cell info is erased and KV data is zeroed - DEPRECATED(LLAMA_API void llama_kv_self_clear( - struct llama_context * ctx), - "Use llama_memory_clear() instead"); - - // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails - // seq_id < 0 : match any sequence - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) - DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1), - "Use llama_memory_seq_rm() instead"); - - // Copy all tokens that belong to the specified sequence to another sequence - // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) - DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( - struct llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1), - "Use llama_memory_seq_cp() instead"); - - // Removes all tokens that do not belong to the specified sequence - DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( - struct llama_context * ctx, - llama_seq_id seq_id), - "Use llama_memory_seq_keep() instead"); - - // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly: - // - lazily on next llama_decode() - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) - DEPRECATED(LLAMA_API void llama_kv_self_seq_add( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta), - "Use llama_memory_seq_add() instead"); - - // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly: - // - lazily on next llama_decode() - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) - DEPRECATED(LLAMA_API void llama_kv_self_seq_div( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d), - "Use llama_memory_seq_div() instead"); - - // Returns the smallest position present in the KV cache for the specified sequence - // This is typically non-zero only for SWA caches - // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache - // Return -1 if the sequence is empty - DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( - struct llama_context * ctx, - llama_seq_id seq_id), - "Use llama_memory_seq_pos_min() instead"); - - // Returns the largest position present in the KV cache for the specified sequence - // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache - // Return -1 if the sequence is empty - DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( - struct llama_context * ctx, - llama_seq_id seq_id), - "Use llama_memory_seq_pos_max() instead"); - - // Defragment the KV cache - // This will be applied: - // - lazily on next llama_decode() - DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), - "simply remove this call, the context will automatically decide when to do a defragmentation based 
on 'defrag_thold'"); - - // Check if the context supports KV cache shifting - DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), - "use llama_memory_can_shift() instead"); - - // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) - DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), - "simply remove this call, updates are applied lazily on the next llama_decode()"); - // // State / sessions // diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 29d30e0a18..826c560cd8 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -74,21 +74,7 @@ while read c; do cmake/common.cmake \ cmake/ggml-config.cmake.in \ src/ggml-cpu/cmake/FindSIMD.cmake \ - src/ggml*.h \ - src/ggml*.c \ - src/ggml*.cpp \ - src/gguf*.cpp \ - src/ggml-blas/* \ - src/ggml-cann/* \ - src/ggml-cpu/* \ - src/ggml-cuda/* \ - src/ggml-hip/* \ - src/ggml-metal/* \ - src/ggml-musa/* \ - src/ggml-opencl/* \ - src/ggml-rpc/* \ - src/ggml-sycl/* \ - src/ggml-vulkan/* \ + src/ggml* \ include/ggml*.h \ include/gguf*.h \ tests/test-opt.cpp \ @@ -131,21 +117,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # cmake/ggml-config.cmake.in -> ggml/cmake/ggml-config.cmake.in # src/ggml-cpu/cmake/FindSIMD.cmake -> ggml/src/ggml-cpu/cmake/FindSIMD.cmake # - # src/ggml*.c -> ggml/src/ggml*.c - # src/ggml*.cpp -> ggml/src/ggml*.cpp - # src/ggml*.h -> ggml/src/ggml*.h - # src/gguf*.cpp -> ggml/src/gguf*.cpp - # src/ggml-blas/* -> ggml/src/ggml-blas/* - # src/ggml-cann/* -> ggml/src/ggml-cann/* - # src/ggml-cpu/* -> ggml/src/ggml-cpu/* - # src/ggml-cuda/* -> ggml/src/ggml-cuda/* - # src/ggml-hip/* -> ggml/src/ggml-hip/* - # src/ggml-metal/* -> ggml/src/ggml-metal/* - # src/ggml-musa/* -> ggml/src/ggml-musa/* - # src/ggml-opencl/* -> ggml/src/ggml-opencl/* - # src/ggml-rpc/* -> ggml/src/ggml-rpc/* - # src/ggml-sycl/* -> ggml/src/ggml-sycl/* - # src/ggml-vulkan/* -> ggml/src/ggml-vulkan/* + # src/ggml* -> ggml/src/ggml* # # include/ggml*.h -> ggml/include/ggml*.h # include/gguf*.h -> ggml/include/gguf*.h @@ -163,20 +135,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \ -e 's/([[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \ - -e 's/([[:space:]]| [ab]\/)src\/gguf(.*)\.cpp/\1ggml\/src\/gguf\2.cpp/g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \ + -e 's/([[:space:]]| 
[ab]\/)src\/ggml(.*)/\1ggml\/src\/ggml\2/g' \ -e 's/([[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \ -e 's/([[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \ -e 's/([[:space:]]| [ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7e2875f870..22348782d1 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -b141fc226b68e4af383101c39da90b54ede98850 +323951f1bdcdfbd5b5ff3a9a7c3770e63b1a560e diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 9b98329e09..2da9b57891 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -6,21 +6,7 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/* ./ggml/cmake/ cp -rpv ../ggml/src/ggml-cpu/cmake/* ./ggml/src/ggml-cpu/cmake/ -cp -rpv ../ggml/src/ggml*.c ./ggml/src/ -cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml*.h ./ggml/src/ -cp -rpv ../ggml/src/gguf*.cpp ./ggml/src/ -cp -rpv ../ggml/src/ggml-blas/* ./ggml/src/ggml-blas/ -cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ -cp -rpv ../ggml/src/ggml-cpu/* ./ggml/src/ggml-cpu/ -cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ -cp -rpv ../ggml/src/ggml-hip/* ./ggml/src/ggml-hip/ -cp -rpv ../ggml/src/ggml-metal/* ./ggml/src/ggml-metal/ -cp -rpv ../ggml/src/ggml-musa/* ./ggml/src/ggml-musa/ -cp -rpv ../ggml/src/ggml-opencl/* ./ggml/src/ggml-opencl/ -cp -rpv ../ggml/src/ggml-rpc/* ./ggml/src/ggml-rpc/ -cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ -cp -rpv ../ggml/src/ggml-vulkan/* ./ggml/src/ggml-vulkan/ +cp -rpv ../ggml/src/ggml* ./ggml/src/ cp -rpv ../ggml/include/ggml*.h ./ggml/include/ cp -rpv ../ggml/include/gguf*.h ./ggml/include/ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8f9cd65244..18cfc76564 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,8 +20,8 @@ add_library(llama llama-hparams.cpp llama-impl.cpp llama-io.cpp - llama-kv-cache-unified.cpp - llama-kv-cache-unified-iswa.cpp + llama-kv-cache.cpp + llama-kv-cache-iswa.cpp llama-memory.cpp llama-memory-hybrid.cpp llama-memory-recurrent.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7d7abad5d4..e8e8b3450a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -93,7 +93,7 @@ llama_context::llama_context( // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. 
ggml_flash_attn_ext) // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self + // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory if (cparams.n_batch < GGML_KQ_MASK_PAD) { LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); cparams.n_batch = GGML_KQ_MASK_PAD; @@ -145,11 +145,6 @@ llama_context::llama_context( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) { - LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n", - __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573"); - } - if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -444,26 +439,12 @@ llama_memory_t llama_context::get_memory() const { return memory.get(); } -// deprecated -void llama_context::kv_self_defrag_sched() { - if (!memory) { - return; - } - - memory_force_optimize = true; -} - -// deprecated -bool llama_context::kv_self_update(bool optimize) { +bool llama_context::memory_update(bool optimize) { if (!memory) { return false; } { - // TODO: remove in the future - optimize |= memory_force_optimize; - memory_force_optimize = false; - const auto mctx = memory->init_update(this, optimize); switch (mctx->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: @@ -998,7 +979,7 @@ int llama_context::decode(const llama_batch & batch_inp) { bool did_optimize = false; // handle any pending defrags/shifts - kv_self_update(false); + memory_update(false); llama_memory_context_ptr mctx; @@ -1023,7 +1004,7 @@ int llama_context::decode(const llama_batch & batch_inp) { if (!did_optimize) { did_optimize = true; - if (kv_self_update(true)) { + if (memory_update(true)) { LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens()); continue; @@ -2343,16 +2324,6 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->get_model(); } -// deprecated -llama_kv_cache * llama_get_kv_self(llama_context * ctx) { - return dynamic_cast(ctx->get_memory()); -} - -// deprecated -void llama_kv_self_update(llama_context * ctx) { - ctx->kv_self_update(false); -} - enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->pooling_type(); } @@ -2570,168 +2541,6 @@ bool llama_memory_can_shift(llama_memory_t mem) { return mem->get_can_shift(); } -// -// kv cache -// - -// deprecated -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - const auto * kv = llama_get_memory(ctx); - if (!kv) { - return 0; - } - - int32_t res = 0; - - for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { - const llama_pos p0 = kv->seq_pos_min(s); - const llama_pos p1 = kv->seq_pos_max(s); - - if (p0 >= 0) { - res += (p1 - p0) + 1; - } - } - - return res; -} - -// deprecated -// note: this is the same as above - will be removed anyway, so it's ok -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - const auto * kv = llama_get_memory(ctx); - if (!kv) { - return 0; - } - - int32_t res = 0; - - for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { - const llama_pos p0 = kv->seq_pos_min(s); - const llama_pos p1 = kv->seq_pos_max(s); - - if (p0 >= 0) { - res += (p1 - p0) + 1; - } - } - - return res; -} - -// deprecated 
-void llama_kv_self_clear(llama_context * ctx) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return; - } - - llama_memory_clear(kv, true); -} - -// deprecated -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return true; - } - - return llama_memory_seq_rm(kv, seq_id, p0, p1); -} - -// deprecated -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return; - } - - llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1); -} - -// deprecated -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return; - } - - llama_memory_seq_keep(kv, seq_id); -} - -// deprecated -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return; - } - - llama_memory_seq_add(kv, seq_id, p0, p1, delta); -} - -// deprecated -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return; - } - - llama_memory_seq_div(kv, seq_id, p0, p1, d); -} - -// deprecated -llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return -1; - } - - return llama_memory_seq_pos_min(kv, seq_id); -} - -// deprecated -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return -1; - } - - return llama_memory_seq_pos_max(kv, seq_id); -} - -// deprecated -void llama_kv_self_defrag(llama_context * ctx) { - // force defrag - ctx->kv_self_defrag_sched(); -} - -// deprecated -bool llama_kv_self_can_shift(const llama_context * ctx) { - auto * kv = llama_get_memory(ctx); - if (!kv) { - return false; - } - - return llama_memory_can_shift(kv); -} - // llama state API // deprecated diff --git a/src/llama-context.h b/src/llama-context.h index 230ef8962b..3dd9205446 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -46,10 +46,8 @@ struct llama_context { llama_memory_t get_memory() const; - // return true of the KV cache was updated - // TODO: remove - bool kv_self_update(bool optimize); - void kv_self_defrag_sched(); + // return true if the memory was updated + bool memory_update(bool optimize); enum llama_pooling_type pooling_type() const; @@ -230,9 +228,6 @@ private: std::unique_ptr memory; - // TODO: temporary, until the llama_kv_self_defrag() API is removed - bool memory_force_optimize = false; - // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits float * logits = nullptr; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 053c72d6dc..6419d739bd 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -4,8 +4,8 @@ #include "llama-batch.h" #include "llama-cparams.h" -#include "llama-kv-cache-unified.h" -#include "llama-kv-cache-unified-iswa.h" +#include "llama-kv-cache.h" +#include "llama-kv-cache-iswa.h" #include "llama-memory-hybrid.h" #include "llama-memory-recurrent.h" @@ -277,7 +277,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) { const llama_seq_id s0 = 
ubatch->seq_id[i0][0]; - // TODO: reimplement this like in llama_kv_cache_unified + // TODO: reimplement this like in llama_kv_cache if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) { if (hparams.use_alibi) { f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]); @@ -294,15 +294,15 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { } } -void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { +void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) { mctx->set_input_k_idxs(self_k_idxs, ubatch); mctx->set_input_v_idxs(self_v_idxs, ubatch); mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } -bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) { - const auto * mctx = static_cast(params.mctx); +bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); this->mctx = mctx; @@ -319,7 +319,7 @@ bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) return res; } -void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { +void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -331,8 +331,8 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } -bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) { - const auto * mctx = static_cast(params.mctx); +bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); this->mctx = mctx; @@ -1186,7 +1186,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { } ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const { - const auto * mctx_cur = static_cast(mctx); + const auto * mctx_cur = static_cast(mctx); auto inp = std::make_unique(hparams, mctx_cur); @@ -1223,8 +1223,8 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_tensor * v, ggml_tensor * kq_b, ggml_tensor * kq_mask, - ggml_tensor * v_mla, ggml_tensor * sinks, + ggml_tensor * v_mla, float kq_scale) const { const bool v_trans = v->nb[1] > v->nb[2]; @@ -1360,6 +1360,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * sinks, ggml_tensor * v_mla, float kq_scale, int il) const { @@ -1381,7 +1382,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1399,17 +1400,17 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } -static std::unique_ptr build_attn_inp_kv_unified_impl( +static std::unique_ptr build_attn_inp_kv_impl( ggml_context * ctx0, const llama_ubatch & ubatch, const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_context * mctx_cur) { + const llama_kv_cache_context * mctx_cur) { - auto inp = std::make_unique(hparams, cparams, mctx_cur); + auto inp = std::make_unique(hparams, cparams, mctx_cur); { - GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); + 
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA"); const auto n_kv = mctx_cur->get_n_kv(); const auto n_tokens = ubatch.n_tokens; @@ -1427,22 +1428,23 @@ static std::unique_ptr build_attn_inp_kv_unifie return inp; } -llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { - const auto * mctx_cur = static_cast(mctx); +llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const { + const auto * mctx_cur = static_cast(mctx); - auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur); + auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur); - return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp)); + return (llm_graph_input_attn_kv *) res->add_input(std::move(inp)); } ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_kv_unified * inp, + llm_graph_input_attn_kv * inp, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * sinks, ggml_tensor * v_mla, float kq_scale, int il) const { @@ -1469,7 +1471,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1488,40 +1490,15 @@ ggml_tensor * llm_graph_context::build_attn( } ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_kv_unified_iswa * inp, + llm_graph_input_attn_kv_iswa * inp, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - ggml_tensor * v_mla, - float kq_scale, - int il) const { - return build_attn_with_sinks( - inp, - wo, - wo_b, - q_cur, - k_cur, - v_cur, - kq_b, - v_mla, - nullptr, - kq_scale, - il); -} - -ggml_tensor * llm_graph_context::build_attn_with_sinks( - llm_graph_input_attn_kv_unified_iswa * inp, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_b, - ggml_tensor * v_mla, ggml_tensor * sinks, + ggml_tensor * v_mla, float kq_scale, int il) const { // these nodes are added to the graph together so that they are not reordered @@ -1561,7 +1538,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1600,6 +1577,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * sinks, ggml_tensor * v_mla, float kq_scale, int il) const { @@ -1615,7 +1593,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1636,10 +1614,10 @@ ggml_tensor * llm_graph_context::build_attn( // TODO: maybe separate the inner implementation into a separate function // like with the non-sliding window equivalent // once sliding-window hybrid 
caches are a thing. -llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const { - const auto * mctx_cur = static_cast(mctx); +llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const { + const auto * mctx_cur = static_cast(mctx); - auto inp = std::make_unique(hparams, cparams, mctx_cur); + auto inp = std::make_unique(hparams, cparams, mctx_cur); const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; @@ -1656,7 +1634,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif } { - GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); + GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA"); const auto n_kv = mctx_cur->get_swa()->get_n_kv(); @@ -1669,7 +1647,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } - return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp)); + return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp)); } ggml_tensor * llm_graph_context::build_rs( @@ -1792,7 +1770,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { const auto * mctx_cur = static_cast(mctx); auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr()); - auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); + auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); auto inp = std::make_unique(std::move(inp_attn), std::move(inp_rs), mctx_cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index 6ff49de3a1..e11d91d529 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -19,8 +19,8 @@ struct llama_cparams; struct llama_memory_context_i; -class llama_kv_cache_unified_context; -class llama_kv_cache_unified_iswa_context; +class llama_kv_cache_context; +class llama_kv_cache_iswa_context; class llama_memory_recurrent_context; class llama_memory_hybrid_context; @@ -152,7 +152,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { public: llm_graph_input_pos_bucket_kv( const llama_hparams & hparams, - const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {} + const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {} virtual ~llm_graph_input_pos_bucket_kv() = default; void set_input(const llama_ubatch * ubatch) override; @@ -161,7 +161,7 @@ public: const llama_hparams hparams; - const llama_kv_cache_unified_context * mctx; + const llama_kv_cache_context * mctx; }; class llm_graph_input_out_ids : public llm_graph_input_i { @@ -257,17 +257,17 @@ public: const llama_cparams cparams; }; -class llm_graph_input_attn_kv_unified : public llm_graph_input_i { +class llm_graph_input_attn_kv : public llm_graph_input_i { public: - llm_graph_input_attn_kv_unified( + llm_graph_input_attn_kv( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_context * mctx) : + const llama_kv_cache_context * mctx) : hparams(hparams), cparams(cparams), mctx(mctx) { } - ~llm_graph_input_attn_kv_unified() = default; + ~llm_graph_input_attn_kv() = default; void set_input(const llama_ubatch * ubatch) override; @@ -290,20 +290,20 @@ public: const llama_hparams hparams; const llama_cparams cparams; - const llama_kv_cache_unified_context * mctx; + const 
llama_kv_cache_context * mctx; }; -class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { +class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { public: - llm_graph_input_attn_kv_unified_iswa( + llm_graph_input_attn_kv_iswa( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_iswa_context * mctx) : + const llama_kv_cache_iswa_context * mctx) : hparams(hparams), cparams(cparams), mctx(mctx) { } - ~llm_graph_input_attn_kv_unified_iswa() = default; + ~llm_graph_input_attn_kv_iswa() = default; void set_input(const llama_ubatch * ubatch) override; @@ -330,7 +330,7 @@ public: const llama_hparams hparams; const llama_cparams cparams; - const llama_kv_cache_unified_iswa_context * mctx; + const llama_kv_cache_iswa_context * mctx; }; class llm_graph_input_attn_cross : public llm_graph_input_i { @@ -351,7 +351,7 @@ public: class llm_graph_input_mem_hybrid : public llm_graph_input_i { public: llm_graph_input_mem_hybrid( - std::unique_ptr inp_attn, + std::unique_ptr inp_attn, std::unique_ptr inp_rs, const llama_memory_hybrid_context * mctx) : inp_attn(std::move(inp_attn)), @@ -361,11 +361,11 @@ public: void set_input(const llama_ubatch * ubatch) override; - std::unique_ptr inp_attn; - std::unique_ptr inp_rs; + std::unique_ptr inp_attn; + std::unique_ptr inp_rs; - llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); } - llm_graph_input_rs * get_recr() const { return inp_rs.get(); } + llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); } + llm_graph_input_rs * get_recr() const { return inp_rs.get(); } const llama_memory_hybrid_context * mctx; }; @@ -680,14 +680,14 @@ struct llm_graph_context { // ggml_tensor * build_attn_mha( - ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] - ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) - ggml_tensor * kq_b, - ggml_tensor * kq_mask, - ggml_tensor * sinks, - ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] - float kq_scale) const; + ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] + ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] + ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) + ggml_tensor * kq_b, + ggml_tensor * kq_mask, + ggml_tensor * sinks, // [n_head_q] + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] + float kq_scale) const; llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const; @@ -699,50 +699,39 @@ struct llm_graph_context { ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; - llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const; + llm_graph_input_attn_kv * build_attn_inp_kv() const; ggml_tensor * build_attn( - llm_graph_input_attn_kv_unified * inp, + llm_graph_input_attn_kv * inp, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; - llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const; + 
llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const; // note: if k_cur or v_cur are not provided, they will not be stored in the memory ggml_tensor * build_attn( - llm_graph_input_attn_kv_unified_iswa * inp, + llm_graph_input_attn_kv_iswa * inp, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional ggml_tensor * kq_b, - ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] - float kq_scale, - int il) const; - - // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this - ggml_tensor * build_attn_with_sinks( - llm_graph_input_attn_kv_unified_iswa * inp, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional - ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional - ggml_tensor * kq_b, - ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * sinks, // [n_head_q] + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; @@ -756,6 +745,7 @@ struct llm_graph_context { ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; @@ -765,7 +755,7 @@ struct llm_graph_context { // // TODO: move this implementation to llama_memory_recurrent. - // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v + // this is analogous to llama_kv_cache::cpy_k / cpy_v // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the // implementation in 2 separate methods. 
the goal is to avoid calling `ggml_build_forward_expand` in // `llama_memory_recurrent` diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-iswa.cpp similarity index 62% rename from src/llama-kv-cache-unified-iswa.cpp rename to src/llama-kv-cache-iswa.cpp index 1e363fff2a..a11ee5a5b1 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -1,4 +1,4 @@ -#include "llama-kv-cache-unified-iswa.h" +#include "llama-kv-cache-iswa.h" #include "llama-impl.h" #include "llama-batch.h" @@ -8,10 +8,10 @@ #include // -// llama_kv_cache_unified_iswa +// llama_kv_cache_iswa // -llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( +llama_kv_cache_iswa::llama_kv_cache_iswa( const llama_model & model, ggml_type type_k, ggml_type type_v, @@ -23,8 +23,8 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad) : hparams(model.hparams), unified(unified) { - llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; - llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; + llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; + llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const uint32_t size_base = kv_size; @@ -40,25 +40,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); - kv_base = std::make_unique( + kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); - kv_swa = std::make_unique( + kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type); } -void llama_kv_cache_unified_iswa::clear(bool data) { +void llama_kv_cache_iswa::clear(bool data) { kv_base->clear(data); kv_swa ->clear(data); } -bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { bool res = true; res = res & kv_base->seq_rm(seq_id, p0, p1); @@ -67,36 +67,36 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam return res; } -void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1); kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) { +void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) { kv_base->seq_keep(seq_id); kv_swa ->seq_keep(seq_id); } -void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { +void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { kv_base->seq_add(seq_id, p0, p1, shift); kv_swa ->seq_add(seq_id, p0, p1, shift); } -void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { +void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, 
int d) { kv_base->seq_div(seq_id, p0, p1, d); kv_swa ->seq_div(seq_id, p0, p1, d); } -llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const { +llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const { // the base cache is a superset of the SWA cache, so we can just check the SWA cache return kv_swa->seq_pos_min(seq_id); } -llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { +llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } -llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { +llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { GGML_UNUSED(embd_all); // first try simple split @@ -136,7 +136,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all assert(sinfos_base.size() == sinfos_swa.size()); - return std::make_unique( + return std::make_unique( this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); } while (false); @@ -172,29 +172,29 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all assert(sinfos_base.size() == sinfos_swa.size()); - return std::make_unique( + return std::make_unique( this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); } while (false); // TODO: if we fail again, we should attempt different splitting strategies // but to do that properly, we first have to refactor the batches to be more flexible - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_kv_cache_iswa::init_full() { + return std::make_unique(this); } -llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) { - return std::make_unique(this, lctx, optimize); +llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); } -bool llama_kv_cache_unified_iswa::get_can_shift() const { +bool llama_kv_cache_iswa::get_can_shift() const { return kv_base->get_size() == kv_swa->get_size(); } -void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { +void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) { kv_base->state_write(io, seq_id, flags); } @@ -202,7 +202,7 @@ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_i kv_swa->state_write(io, seq_id, flags); } -void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { +void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) { kv_base->state_read(io, seq_id, flags); } @@ -210,29 +210,29 @@ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id kv_swa->state_read(io, seq_id, flags); } -llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const { +llama_kv_cache * llama_kv_cache_iswa::get_base() const { return kv_base.get(); } -llama_kv_cache_unified * 
llama_kv_cache_unified_iswa::get_swa() const { +llama_kv_cache * llama_kv_cache_iswa::get_swa() const { return kv_swa.get(); } // -// llama_kv_cache_unified_iswa_context +// llama_kv_cache_iswa_context // -llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {} +llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {} -llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv) : +llama_kv_cache_iswa_context::llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv) : ctx_base(kv->get_base()->init_full()), ctx_swa (kv->get_swa ()->init_full()), status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv, +llama_kv_cache_iswa_context::llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv, llama_context * lctx, bool optimize) : ctx_base(kv->get_base()->init_update(lctx, optimize)), @@ -240,21 +240,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv, +llama_kv_cache_iswa_context::llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv, slot_info_vec_t sinfos_base, slot_info_vec_t sinfos_swa, std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal - ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)), - ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)), + ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)), + ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)), status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { } -llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default; +llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default; -bool llama_kv_cache_unified_iswa_context::next() { +bool llama_kv_cache_iswa_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); ctx_base->next(); @@ -267,7 +267,7 @@ bool llama_kv_cache_unified_iswa_context::next() { return true; } -bool llama_kv_cache_unified_iswa_context::apply() { +bool llama_kv_cache_iswa_context::apply() { assert(!llama_memory_status_is_fail(status)); bool res = true; @@ -278,24 +278,24 @@ bool llama_kv_cache_unified_iswa_context::apply() { return res; } -llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const { +llama_memory_status llama_kv_cache_iswa_context::get_status() const { return status; } -const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const { +const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_next]; } -const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const { +const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - return static_cast(ctx_base.get()); + return static_cast(ctx_base.get()); } -const llama_kv_cache_unified_context * 
llama_kv_cache_unified_iswa_context::get_swa() const { +const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); - return static_cast(ctx_swa.get()); + return static_cast(ctx_swa.get()); } diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-iswa.h similarity index 68% rename from src/llama-kv-cache-unified-iswa.h rename to src/llama-kv-cache-iswa.h index 7bc4df718d..dd673f18e7 100644 --- a/src/llama-kv-cache-unified-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -1,32 +1,32 @@ #pragma once -#include "llama-kv-cache-unified.h" +#include "llama-kv-cache.h" #include // -// llama_kv_cache_unified_iswa +// llama_kv_cache_iswa // -// utilizes two instances of llama_kv_cache_unified +// utilizes two instances of llama_kv_cache // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers -class llama_kv_cache_unified_iswa : public llama_memory_i { +class llama_kv_cache_iswa : public llama_memory_i { public: - llama_kv_cache_unified_iswa( + llama_kv_cache_iswa( const llama_model & model, ggml_type type_k, ggml_type type_v, bool v_trans, bool offload, bool swa_full, - bool unified, + bool , uint32_t kv_size, uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad); - ~llama_kv_cache_unified_iswa() = default; + ~llama_kv_cache_iswa() = default; // // llama_memory_i @@ -60,46 +60,46 @@ public: void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override; // - // llama_kv_cache_unified_iswa specific API + // llama_kv_cache_iswa specific API // - llama_kv_cache_unified * get_base() const; - llama_kv_cache_unified * get_swa () const; + llama_kv_cache * get_base() const; + llama_kv_cache * get_swa () const; private: const llama_hparams & hparams; const bool unified; - std::unique_ptr kv_base; - std::unique_ptr kv_swa; + std::unique_ptr kv_base; + std::unique_ptr kv_swa; }; -class llama_kv_cache_unified_iswa_context : public llama_memory_context_i { +class llama_kv_cache_iswa_context : public llama_memory_context_i { public: - using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; // used for errors - llama_kv_cache_unified_iswa_context(llama_memory_status status); + llama_kv_cache_iswa_context(llama_memory_status status); // used to create a full-cache context - llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv); + llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv); // used to create an update context - llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv, + llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv, llama_context * lctx, bool optimize); // used to create a batch processing context from a batch - llama_kv_cache_unified_iswa_context( - llama_kv_cache_unified_iswa * kv, + llama_kv_cache_iswa_context( + llama_kv_cache_iswa * kv, slot_info_vec_t sinfos_base, slot_info_vec_t sinfos_swa, std::vector ubatches); - virtual ~llama_kv_cache_unified_iswa_context(); + virtual ~llama_kv_cache_iswa_context(); // // llama_memory_context_i @@ -112,14 +112,14 @@ public: const llama_ubatch & get_ubatch() const override; // - // llama_kv_cache_unified_iswa_context specific API + // llama_kv_cache_iswa_context specific API // - const llama_kv_cache_unified_context * get_base() const; - const llama_kv_cache_unified_context * get_swa() const; + const llama_kv_cache_context * get_base() const; + const llama_kv_cache_context * get_swa() 
const; private: - //llama_kv_cache_unified_iswa * kv; + //llama_kv_cache_iswa * kv; // the index of the next ubatch to process size_t i_next = 0; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache.cpp similarity index 90% rename from src/llama-kv-cache-unified.cpp rename to src/llama-kv-cache.cpp index 478ebffac0..bb490cf9e8 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache.cpp @@ -1,4 +1,4 @@ -#include "llama-kv-cache-unified.h" +#include "llama-kv-cache.h" #include "llama-impl.h" #include "llama-io.h" @@ -13,10 +13,10 @@ #include // -// llama_kv_cache_unified +// llama_kv_cache // -llama_kv_cache_unified::llama_kv_cache_unified( +llama_kv_cache::llama_kv_cache( const llama_model & model, layer_filter_cb && filter, ggml_type type_k, @@ -209,7 +209,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( } } -void llama_kv_cache_unified::clear(bool data) { +void llama_kv_cache::clear(bool data) { for (uint32_t s = 0; s < n_stream; ++s) { v_cells[s].reset(); v_heads[s] = 0; @@ -222,7 +222,7 @@ void llama_kv_cache_unified::clear(bool data) { } } -bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { @@ -285,7 +285,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos return true; } -void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); @@ -368,7 +368,7 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id //} } -void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { +void llama_kv_cache::seq_keep(llama_seq_id seq_id) { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -390,7 +390,7 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { } } -void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { +void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -434,7 +434,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po head = new_head != cells.size() ? 
new_head : 0; } -void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { +void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -467,7 +467,7 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po } } -llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { +llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -475,7 +475,7 @@ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { return cells.seq_pos_min(seq_id); } -llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { +llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -483,7 +483,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { return cells.seq_pos_max(seq_id); } -llama_memory_context_ptr llama_kv_cache_unified::init_batch( +llama_memory_context_ptr llama_kv_cache::init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { @@ -513,18 +513,18 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch( break; } - return std::make_unique( + return std::make_unique( this, std::move(sinfos), std::move(ubatches)); } while (false); - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } -llama_memory_context_ptr llama_kv_cache_unified::init_full() { - return std::make_unique(this); +llama_memory_context_ptr llama_kv_cache::init_full() { + return std::make_unique(this); } -llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) { +llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) { bool do_shift = get_has_shift(); defrag_info dinfo; @@ -557,18 +557,18 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct } } - return std::make_unique(this, lctx, do_shift, std::move(dinfo), std::move(sc_info)); + return std::make_unique(this, lctx, do_shift, std::move(dinfo), std::move(sc_info)); } -llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector & ubatches) { - llama_kv_cache_unified::slot_info_vec_t res; +llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector & ubatches) { + llama_kv_cache::slot_info_vec_t res; struct state_t { slot_info sinfo; // slot info for the ubatch std::vector v_heads_old; // old positions of the heads, before placing the ubatch - std::vector v_cells; // copy of the old cells, before placing the ubatch + std::vector v_cells; // copy of the old cells, before placing the ubatch }; // remember the old state of the cells so we can restore it in the end @@ -629,7 +629,7 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st return res; } -bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) { +bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) { bool updated = false; auto * sched = lctx->get_sched(); @@ -749,7 +749,7 @@ 
bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d return updated; } -llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const { +llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const { if (debug > 0) { for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { @@ -948,7 +948,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ return res; } -void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { +void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -1013,21 +1013,21 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u } } -bool llama_kv_cache_unified::get_can_shift() const { +bool llama_kv_cache::get_can_shift() const { return true; } -uint32_t llama_kv_cache_unified::get_size() const { +uint32_t llama_kv_cache::get_size() const { const auto & cells = v_cells[seq_to_stream[0]]; return cells.size(); } -uint32_t llama_kv_cache_unified::get_n_stream() const { +uint32_t llama_kv_cache::get_n_stream() const { return n_stream; } -bool llama_kv_cache_unified::get_has_shift() const { +bool llama_kv_cache::get_has_shift() const { bool result = false; for (uint32_t s = 0; s < n_stream; ++s) { @@ -1037,7 +1037,7 @@ bool llama_kv_cache_unified::get_has_shift() const { return result; } -uint32_t llama_kv_cache_unified::get_n_kv() const { +uint32_t llama_kv_cache::get_n_kv() const { uint32_t result = 0; for (uint32_t s = 0; s < n_stream; ++s) { @@ -1049,11 +1049,11 @@ uint32_t llama_kv_cache_unified::get_n_kv() const { return result; } -bool llama_kv_cache_unified::get_supports_set_rows() const { +bool llama_kv_cache::get_supports_set_rows() const { return supports_set_rows; } -ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { +ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * k = layers[ikv].k; @@ -1073,7 +1073,7 @@ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0); } -ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { +ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * v = layers[ikv].v; @@ -1105,7 +1105,7 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0); } -ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const { +ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * k = layers[ikv].k; @@ -1135,7 +1135,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_ return ggml_cpy(ctx, k_cur, k_view); } -ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, 
ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const { +ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const { const int32_t ikv = map_layer_ids.at(il); auto * v = layers[ikv].v; @@ -1189,7 +1189,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_ return ggml_cpy(ctx, v_cur, v_view); } -ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { +ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { const uint32_t n_tokens = ubatch.n_tokens; ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens); @@ -1199,7 +1199,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, con return k_idxs; } -ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { +ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { const uint32_t n_tokens = ubatch.n_tokens; ggml_tensor * v_idxs; @@ -1215,7 +1215,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, con return v_idxs; } -void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { +void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { if (!supports_set_rows) { return; } @@ -1235,7 +1235,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba } } -void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { +void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { if (!supports_set_rows) { return; } @@ -1272,7 +1272,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba } } -void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { +void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const { GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); int32_t * data = (int32_t *) dst->data; @@ -1286,7 +1286,7 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { } } -void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { +void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); @@ -1358,7 +1358,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub } } -void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { +void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams"); @@ -1383,7 +1383,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama } } -size_t llama_kv_cache_unified::total_size() const { +size_t llama_kv_cache::total_size() const { size_t size = 0; for (const auto & buf : bufs) { @@ -1393,7 +1393,7 @@ size_t llama_kv_cache_unified::total_size() const { return size; } -size_t 
llama_kv_cache_unified::size_k_bytes() const { +size_t llama_kv_cache::size_k_bytes() const { size_t size_k_bytes = 0; for (const auto & layer : layers) { @@ -1403,7 +1403,7 @@ size_t llama_kv_cache_unified::size_k_bytes() const { return size_k_bytes; } -size_t llama_kv_cache_unified::size_v_bytes() const { +size_t llama_kv_cache::size_v_bytes() const { size_t size_v_bytes = 0; for (const auto & layer : layers) { @@ -1413,7 +1413,7 @@ size_t llama_kv_cache_unified::size_v_bytes() const { return size_v_bytes; } -ggml_tensor * llama_kv_cache_unified::build_rope_shift( +ggml_tensor * llama_kv_cache::build_rope_shift( const llama_cparams & cparams, ggml_context * ctx, ggml_tensor * cur, @@ -1465,14 +1465,14 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift( class llm_graph_input_k_shift : public llm_graph_input_i { public: - llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {} virtual ~llm_graph_input_k_shift() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * k_shift; // I32 [kv_size*n_stream] - const llama_kv_cache_unified * kv_self; + const llama_kv_cache * kv_self; }; void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { @@ -1483,7 +1483,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } } -ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { +ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); @@ -1525,7 +1525,7 @@ ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, return gf; } -ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( +ggml_cgraph * llama_kv_cache::build_graph_defrag( llm_graph_result * res, llama_context * lctx, const defrag_info & dinfo) const { @@ -1679,7 +1679,7 @@ ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( return gf; } -llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { +llama_kv_cache::defrag_info llama_kv_cache::defrag_prepare(int32_t n_max_nodes) const { GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); const auto & cells = v_cells[0]; @@ -1802,7 +1802,7 @@ llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32 return res; } -bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { +bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const { assert(p0 >= 0 && p1 >= 0); switch (swa_type) { @@ -1828,7 +1828,7 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { return false; } -void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { +void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { GGML_UNUSED(flags); io.write(&n_stream, sizeof(n_stream)); @@ -1881,7 +1881,7 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq } } -void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { +void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { GGML_UNUSED(flags); GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); @@ -1917,7 +1917,7 
@@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i } } -void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const { +void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const { const auto & cells = v_cells[cr.strm]; for (const auto & range : cr.data) { @@ -1945,7 +1945,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ } } -void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const { const auto & cells = v_cells[cr.strm]; const uint32_t v_trans = this->v_trans ? 1 : 0; @@ -2040,7 +2040,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ } } -bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) { +bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) { auto & cells = v_cells[strm]; auto & head = v_heads[strm]; @@ -2137,7 +2137,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm return true; } -bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) { auto & cells = v_cells[strm]; auto & head = v_heads[strm]; @@ -2274,13 +2274,13 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm } // -// llama_kv_cache_unified_context +// llama_kv_cache_context // -llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {} +llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {} -llama_kv_cache_unified_context::llama_kv_cache_unified_context( - llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) { +llama_kv_cache_context::llama_kv_cache_context( + llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) { n_kv = kv->get_size(); const uint32_t n_stream = kv->get_n_stream(); @@ -2296,8 +2296,8 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context( } } -llama_kv_cache_unified_context::llama_kv_cache_unified_context( - llama_kv_cache_unified * kv, +llama_kv_cache_context::llama_kv_cache_context( + llama_kv_cache * kv, llama_context * lctx, bool do_shift, defrag_info dinfo, @@ -2307,15 +2307,15 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context( } } -llama_kv_cache_unified_context::llama_kv_cache_unified_context( - llama_kv_cache_unified * kv, - llama_kv_cache_unified::slot_info_vec_t sinfos, +llama_kv_cache_context::llama_kv_cache_context( + llama_kv_cache * kv, + llama_kv_cache::slot_info_vec_t sinfos, std::vector ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) { } -llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default; +llama_kv_cache_context::~llama_kv_cache_context() = default; -bool llama_kv_cache_unified_context::next() { +bool llama_kv_cache_context::next() { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); if (++i_cur >= ubatches.size()) { @@ -2325,7 +2325,7 @@ bool llama_kv_cache_unified_context::next() { return true; } -bool llama_kv_cache_unified_context::apply() { 
+bool llama_kv_cache_context::apply() { assert(!llama_memory_status_is_fail(status)); // no ubatches -> this is a KV cache update @@ -2342,69 +2342,69 @@ bool llama_kv_cache_unified_context::apply() { return true; } -llama_memory_status llama_kv_cache_unified_context::get_status() const { +llama_memory_status llama_kv_cache_context::get_status() const { return status; } -const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const { +const llama_ubatch & llama_kv_cache_context::get_ubatch() const { assert(status == LLAMA_MEMORY_STATUS_SUCCESS); return ubatches[i_cur]; } -uint32_t llama_kv_cache_unified_context::get_n_kv() const { +uint32_t llama_kv_cache_context::get_n_kv() const { return n_kv; } -bool llama_kv_cache_unified_context::get_supports_set_rows() const { +bool llama_kv_cache_context::get_supports_set_rows() const { return kv->get_supports_set_rows(); } -ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { +ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv, sinfos[i_cur]); } -ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const { +ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const { return kv->get_v(ctx, il, n_kv, sinfos[i_cur]); } -ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { +ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const { return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]); } -ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const { +ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const { return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]); } -ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { +ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { return kv->build_input_k_idxs(ctx, ubatch); } -ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { +ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const { return kv->build_input_v_idxs(ctx, ubatch); } -void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const { +void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const { kv->set_input_k_shift(dst); } -void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { +void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]); } -void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { +void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const { kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]); } -void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { +void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { 
kv->set_input_kq_mask(dst, ubatch, causal_attn); } -void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { +void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { kv->set_input_pos_bucket(dst, ubatch); } -uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) { +uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) { // the FA kernels require padding to avoid extra runtime boundary checks return cparams.flash_attn ? 256u : 32u; } diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache.h similarity index 92% rename from src/llama-kv-cache-unified.h rename to src/llama-kv-cache.h index 07a7c9e4e4..5ca618e1b8 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache.h @@ -14,10 +14,10 @@ struct llama_model; struct llama_context; // -// llama_kv_cache_unified +// llama_kv_cache // -class llama_kv_cache_unified : public llama_memory_i { +class llama_kv_cache : public llama_memory_i { public: static uint32_t get_padding(const llama_cparams & cparams); @@ -92,7 +92,7 @@ public: using slot_info_vec_t = std::vector; - llama_kv_cache_unified( + llama_kv_cache( const llama_model & model, layer_filter_cb && filter, ggml_type type_k, @@ -106,7 +106,7 @@ public: uint32_t n_swa, llama_swa_type swa_type); - ~llama_kv_cache_unified() = default; + ~llama_kv_cache() = default; // // llama_memory_i // @@ -140,7 +140,7 @@ public: void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override; // - // llama_kv_cache_unified specific API + // llama_kv_cache specific API // uint32_t get_size() const; @@ -241,7 +241,7 @@ private: // note: this is not part of the KV state and it's only used to speed-up the find_slot() method std::vector v_heads; - std::vector<llama_kv_cells_unified> v_cells; + std::vector<llama_kv_cells> v_cells; // maps from a sequence id to a stream id std::vector seq_to_stream; @@ -295,35 +295,35 @@ private: bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count); }; -class llama_kv_cache_unified_context : public llama_memory_context_i { +class llama_kv_cache_context : public llama_memory_context_i { public: // some shorthands - using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; - using defrag_info = llama_kv_cache_unified::defrag_info; - using stream_copy_info = llama_kv_cache_unified::stream_copy_info; + using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; + using defrag_info = llama_kv_cache::defrag_info; + using stream_copy_info = llama_kv_cache::stream_copy_info; // used for errors - llama_kv_cache_unified_context(llama_memory_status status); + llama_kv_cache_context(llama_memory_status status); // used to create a full-cache context - llama_kv_cache_unified_context( - llama_kv_cache_unified * kv); + llama_kv_cache_context( + llama_kv_cache * kv); // used to create an update context - llama_kv_cache_unified_context( - llama_kv_cache_unified * kv, + llama_kv_cache_context( + llama_kv_cache * kv, llama_context * lctx, bool do_shift, defrag_info dinfo, stream_copy_info sc_info); // used to create a batch procesing context from a batch - llama_kv_cache_unified_context( - llama_kv_cache_unified * kv, + llama_kv_cache_context( + llama_kv_cache * kv, slot_info_vec_t sinfos, std::vector<llama_ubatch> ubatches); - virtual ~llama_kv_cache_unified_context(); + virtual ~llama_kv_cache_context(); // // llama_memory_context_i // @@ -336,7 +336,7 @@ public: const llama_ubatch & get_ubatch() const override; // - //
llama_kv_cache_unified_context specific API + // llama_kv_cache_context specific API // uint32_t get_n_kv() const; @@ -365,7 +365,7 @@ public: private: llama_memory_status status; - llama_kv_cache_unified * kv; + llama_kv_cache * kv; llama_context * lctx; // diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h index 0d0dd316fd..2651e30331 100644 --- a/src/llama-kv-cells.h +++ b/src/llama-kv-cells.h @@ -11,7 +11,7 @@ // meta information about KV cells that can be part of multiple sequences at the same time // TODO: add unit tests -class llama_kv_cells_unified { +class llama_kv_cells { public: void reset() { for (uint32_t i = 0; i < pos.size(); ++i) { @@ -97,10 +97,10 @@ public: } // copy the state of cells [i, i + n) (used for save/restore the state of the cells) - llama_kv_cells_unified cp(uint32_t i, uint32_t n) const { + llama_kv_cells cp(uint32_t i, uint32_t n) const { assert(i + n <= pos.size()); - llama_kv_cells_unified res; + llama_kv_cells res; res.resize(n); @@ -117,8 +117,8 @@ public: } // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) - llama_kv_cells_unified cp(const std::vector & idxs) const { - llama_kv_cells_unified res; + llama_kv_cells cp(const std::vector & idxs) const { + llama_kv_cells res; res.resize(idxs.size()); @@ -135,7 +135,7 @@ public: } // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells) - void set(uint32_t i, const llama_kv_cells_unified & other) { + void set(uint32_t i, const llama_kv_cells & other) { assert(i + other.pos.size() <= pos.size()); for (uint32_t j = 0; j < other.pos.size(); ++j) { @@ -165,7 +165,7 @@ public: } // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) - void set(const std::vector & idxs, const llama_kv_cells_unified & other) { + void set(const std::vector & idxs, const llama_kv_cells & other) { assert(idxs.size() == other.pos.size()); for (uint32_t j = 0; j < other.pos.size(); ++j) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index cbeeb21344..f8303dacbf 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -30,7 +30,7 @@ llama_memory_hybrid::llama_memory_hybrid( layer_filter_cb && filter_attn, layer_filter_cb && filter_recr) : hparams(model.hparams), - mem_attn(new llama_kv_cache_unified( + mem_attn(new llama_kv_cache( model, filter_attn == nullptr ? [&](int32_t il) { return !hparams.is_recurrent(il); } @@ -179,7 +179,7 @@ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, mem_recr->state_read(io, seq_id); } -llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const { +llama_kv_cache * llama_memory_hybrid::get_mem_attn() const { return mem_attn.get(); } @@ -210,7 +210,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context( std::vector ubatches) : ubatches(std::move(ubatches)), // note: here we copy the ubatches. 
not sure if this is ideal - ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)), + ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)), ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)), status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { } @@ -248,8 +248,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const { return ubatches[i_next]; } -const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const { - return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get()); +const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const { + return static_cast<const llama_kv_cache_context *>(ctx_attn.get()); } const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const { diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index acdbc26bfb..e9c64ee40a 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -2,7 +2,7 @@ #include "llama-batch.h" #include "llama-graph.h" -#include "llama-kv-cache-unified.h" +#include "llama-kv-cache.h" #include "llama-memory.h" #include "llama-memory-recurrent.h" @@ -13,7 +13,7 @@ // llama_memory_hybrid // -// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to +// utilizes instances of llama_memory_recurrent and llama_kv_cache to // support models where each layer may be either attention-based or recurrent class llama_memory_hybrid : public llama_memory_i { @@ -81,19 +81,19 @@ public: // llama_memory_hybrid specific API // - llama_kv_cache_unified * get_mem_attn() const; + llama_kv_cache * get_mem_attn() const; llama_memory_recurrent * get_mem_recr() const; private: const llama_hparams & hparams; - const std::unique_ptr<llama_kv_cache_unified> mem_attn; + const std::unique_ptr<llama_kv_cache> mem_attn; const std::unique_ptr<llama_memory_recurrent> mem_recr; }; class llama_memory_hybrid_context : public llama_memory_context_i { public: - using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; + using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; // init failure explicit llama_memory_hybrid_context(llama_memory_status status); @@ -125,7 +125,7 @@ public: // llama_memory_hybrid_context // - const llama_kv_cache_unified_context * get_attn() const; + const llama_kv_cache_context * get_attn() const; const llama_memory_recurrent_context * get_recr() const; private: diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 95c617b2c9..c8e8623602 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -12,7 +12,7 @@ // // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i -// see the implementation of llama_kv_cache_unified_context_i for an example how to do it +// see the implementation of llama_kv_cache_context_i for an example how to do it class llama_memory_recurrent : public llama_memory_i { public: diff --git a/src/llama-memory.h b/src/llama-memory.h index 171d312cc9..42a7145c2f 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -36,8 +36,8 @@ bool llama_memory_status_is_fail(llama_memory_status status); // the interface for managing the memory context during batch processing // this interface is implemented per memory type. see: -// - llama_kv_cache_unified_context -// - llama_kv_cache_unified_iswa_context +// - llama_kv_cache_context +// - llama_kv_cache_iswa_context // ...
// // the only method that should mutate the memory and the memory context is llama_memory_i::apply() @@ -109,8 +109,3 @@ struct llama_memory_i { }; using llama_memory_ptr = std::unique_ptr; - -// TODO: temporary until the llama_kv_cache is removed from the public API -struct llama_kv_cache : public llama_memory_i { - virtual ~llama_kv_cache() = default; -}; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 23a26f0c64..c4f0b12f24 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6,8 +6,8 @@ #include "llama-cparams.h" #include "llama-model-loader.h" -#include "llama-kv-cache-unified.h" -#include "llama-kv-cache-unified-iswa.h" +#include "llama-kv-cache.h" +#include "llama-kv-cache-iswa.h" #include "llama-memory-hybrid.h" #include "llama-memory-recurrent.h" @@ -86,6 +86,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_40B: return "40B"; case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; + case LLM_TYPE_120B: return "120B"; case LLM_TYPE_142B: return "142B"; case LLM_TYPE_236B: return "236B"; case LLM_TYPE_290B: return "290B"; @@ -1834,7 +1835,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(2); - // TODO: switch (hparams.n_layer) + switch (hparams.n_layer) { + case 24: type = LLM_TYPE_20B; break; + case 36: type = LLM_TYPE_120B; break; + default: type = LLM_TYPE_UNKNOWN; + } } break; case LLM_ARCH_LFM2: { @@ -5981,7 +5986,7 @@ struct llm_build_llama : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -6045,7 +6050,7 @@ struct llm_build_llama : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -6141,7 +6146,7 @@ struct llm_build_llama_iswa : public llm_graph_context { ggml_tensor * inp_attn_scale = nullptr; inp_attn_scale = build_inp_attn_scale(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -6219,7 +6224,7 @@ struct llm_build_llama_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -6320,7 +6325,7 @@ struct llm_build_deci : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -6396,7 +6401,7 @@ struct llm_build_deci : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -6476,7 +6481,7 @@ struct llm_build_baichuan : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -6528,7 +6533,7 @@ struct llm_build_baichuan : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -6598,7 +6603,7 @@ struct llm_build_xverse : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -6643,7 +6648,7 @@ struct llm_build_xverse : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -6712,7 +6717,7 @@ struct llm_build_falcon : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -6743,9 +6748,9 @@ struct llm_build_falcon : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode Qcur = ggml_rope_ext( @@ -6766,7 +6771,7 @@ struct llm_build_falcon : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -6836,7 +6841,7 @@ struct llm_build_grok : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -6896,7 +6901,7 @@ struct llm_build_grok : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, 
il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -6996,7 +7001,7 @@ struct llm_build_dbrx : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -7023,9 +7028,9 @@ struct llm_build_dbrx : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -7045,7 +7050,7 @@ struct llm_build_dbrx : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -7120,7 +7125,7 @@ struct llm_build_starcoder : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7145,13 +7150,13 @@ struct llm_build_starcoder : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7159,7 +7164,7 @@ struct llm_build_starcoder : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -7225,7 +7230,7 @@ struct llm_build_refact : public 
llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -7258,7 +7263,7 @@ struct llm_build_refact : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -7367,13 +7372,15 @@ struct llm_build_bert : public llm_graph_context { cb(cur, "bqkv", il); } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } if (model.layers[il].attn_q_norm) { @@ -7381,6 +7388,10 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + } else { + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); } if (model.layers[il].attn_k_norm) { @@ -7388,11 +7399,11 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il); - } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } else { + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } // RoPE if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { @@ -7415,7 +7426,7 @@ struct llm_build_bert : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); } @@ -7537,9 +7548,9 @@ struct llm_build_neo_bert : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur 
= ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // RoPE Qcur = ggml_rope_ext( @@ -7560,7 +7571,7 @@ struct llm_build_neo_bert : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); } @@ -7621,7 +7632,7 @@ struct llm_build_bloom : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); inpL = build_norm(inpL, model.tok_norm, @@ -7646,13 +7657,13 @@ struct llm_build_bloom : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7660,7 +7671,7 @@ struct llm_build_bloom : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -7728,7 +7739,7 @@ struct llm_build_mpt : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); if (model.pos_embd) { // inp_pos - contains the positions @@ -7770,7 +7781,7 @@ struct llm_build_mpt : public llm_graph_context { ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7789,17 +7800,18 @@ struct llm_build_mpt : public llm_graph_context { model.layers[il].attn_k_norm_b, LLM_NORM, il); cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { - Qcur = 
ggml_cont(ctx0, Qcur); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - Kcur = ggml_cont(ctx0, Kcur); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); cb(Kcur, "Kcur", il); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -7807,7 +7819,7 @@ struct llm_build_mpt : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -7877,7 +7889,7 @@ struct llm_build_stablelm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -7953,7 +7965,7 @@ struct llm_build_stablelm : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8029,7 +8041,7 @@ struct llm_build_qwen : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -8051,9 +8063,9 @@ struct llm_build_qwen : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode Qcur = ggml_rope_ext( @@ -8074,7 +8086,7 @@ struct llm_build_qwen : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8144,7 +8156,7 @@ struct llm_build_qwen2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -8194,7 +8206,7 @@ struct llm_build_qwen2 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && 
inp_out_ids) { @@ -8308,8 +8320,9 @@ struct llm_build_dream : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, - nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + cur = build_attn(inp_attn, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8408,8 +8421,9 @@ struct llm_build_llada : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, - 1.0f / sqrtf(float(n_embd_head)), il); + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8469,7 +8483,7 @@ struct llm_build_qwen2vl : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -8522,7 +8536,7 @@ struct llm_build_qwen2vl : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8590,7 +8604,7 @@ struct llm_build_qwen2moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -8649,7 +8663,7 @@ struct llm_build_qwen2moe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8749,7 +8763,7 @@ struct llm_build_qwen3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -8802,7 +8816,7 @@ struct llm_build_qwen3 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -8870,7 +8884,7 @@ struct llm_build_qwen3moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -8923,7 +8937,7 @@ struct llm_build_qwen3moe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9000,7 +9014,7 
@@ struct llm_build_phi2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9026,21 +9040,21 @@ struct llm_build_phi2 : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -9063,7 +9077,7 @@ struct llm_build_phi2 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9129,13 +9143,13 @@ struct llm_build_phi3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - using inp_attn_type = std::conditional_t; + using inp_attn_type = std::conditional_t; inp_attn_type * inp_attn = nullptr; if constexpr (iswa) { - inp_attn = build_attn_inp_kv_unified_iswa(); + inp_attn = build_attn_inp_kv_iswa(); } else { - inp_attn = build_attn_inp_kv_unified(); + inp_attn = build_attn_inp_kv(); } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9164,21 +9178,21 @@ struct llm_build_phi3 : public llm_graph_context { Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, 
n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -9200,7 +9214,7 @@ struct llm_build_phi3 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9287,7 +9301,7 @@ struct llm_build_plamo : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9334,7 +9348,7 @@ struct llm_build_plamo : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9403,7 +9417,7 @@ struct llm_build_gpt2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -9428,21 +9442,21 @@ struct llm_build_gpt2 : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9513,7 +9527,7 @@ struct llm_build_codeshell : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9534,9 +9548,9 @@ struct llm_build_codeshell : public 
llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -9556,7 +9570,7 @@ struct llm_build_codeshell : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9626,7 +9640,7 @@ struct llm_build_orion : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9685,7 +9699,7 @@ struct llm_build_orion : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9753,7 +9767,7 @@ struct llm_build_internlm2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -9812,7 +9826,7 @@ struct llm_build_internlm2 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -9889,7 +9903,7 @@ struct llm_build_minicpm3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -10000,7 +10014,7 @@ struct llm_build_minicpm3 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); + q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10084,7 +10098,7 @@ struct llm_build_gemma : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -10130,7 +10144,7 @@ struct llm_build_gemma : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ 
-10200,7 +10214,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -10245,7 +10259,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10334,7 +10348,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { ggml_tensor * inp_pos = build_inp_pos(); // TODO: is causal == true correct? might need some changes - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -10387,7 +10401,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10485,7 +10499,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * inp_pos = build_inp_pos(); // TODO: is causal == true correct? might need some changes - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer] ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs()); @@ -10568,7 +10582,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } else { // no KV layers ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -10586,7 +10600,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } cur = build_norm(cur, @@ -10864,8 +10878,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens] all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0 cb(all_coefs, "all_coefs", il); - all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup] - all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] + all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup] + all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1); ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup] @@ -10892,7 +10906,7 @@ struct llm_build_starcoder2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -10951,7 +10965,7 @@ struct llm_build_starcoder2 : public llm_graph_context { cur = build_attn(inp_attn, 
model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11378,7 +11392,9 @@ struct llm_build_jamba : public llm_graph_context_mamba { cb(Vcur, "Vcur", il); // No RoPE :) - cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); + cur = build_attn(inp_hybrid->get_attn(), + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11461,7 +11477,7 @@ struct llm_build_command_r : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -11536,7 +11552,7 @@ struct llm_build_command_r : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11608,7 +11624,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -11671,7 +11687,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11743,7 +11759,7 @@ struct llm_build_olmo : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -11802,7 +11818,7 @@ struct llm_build_olmo : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11871,7 +11887,7 @@ struct llm_build_olmo2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -11922,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12000,7 +12016,7 @@ struct llm_build_olmoe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12055,7 +12071,7 
@@ struct llm_build_olmoe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12126,7 +12142,7 @@ struct llm_build_openelm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12188,7 +12204,7 @@ struct llm_build_openelm : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12257,7 +12273,7 @@ struct llm_build_gptneox : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12278,9 +12294,9 @@ struct llm_build_gptneox : public llm_graph_context { ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -12300,7 +12316,7 @@ struct llm_build_gptneox : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12403,7 +12419,7 @@ struct llm_build_arctic : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12450,7 +12466,7 @@ struct llm_build_arctic : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12541,7 +12557,7 @@ struct llm_build_deepseek : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -12605,7 +12621,7 @@ struct llm_build_deepseek : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -12718,7 +12734,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -12833,7 +12849,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); } else { ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); cb(kv, "kv", il); @@ -12867,7 +12883,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } @@ -12965,7 +12981,7 @@ struct llm_build_bitnet : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13034,7 +13050,7 @@ struct llm_build_bitnet : public llm_graph_context { cur = build_attn(inp_attn, NULL, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, @@ -13157,7 +13173,7 @@ struct llm_build_t5_enc : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo_enc, nullptr, - Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); } @@ -13229,7 +13245,7 @@ struct llm_build_t5_dec : public llm_graph_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto * inp_attn_self = build_attn_inp_kv_unified(); + auto * inp_attn_self = build_attn_inp_kv(); auto * inp_attn_cross = build_attn_inp_cross(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13263,7 +13279,7 @@ struct llm_build_t5_dec : public llm_graph_context { cur = build_attn(inp_attn_self, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); } @@ -13295,7 +13311,7 @@ struct llm_build_t5_dec : public llm_graph_context { cur = build_attn(inp_attn_cross, model.layers[il].wo_cross, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); @@ -13394,7 +13410,7 @@ struct llm_build_jais : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = 
build_inp_out_ids(); @@ -13413,21 +13429,21 @@ struct llm_build_jais : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)); + ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)); + ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13492,7 +13508,7 @@ struct llm_build_chatglm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13526,6 +13542,7 @@ struct llm_build_chatglm : public llm_graph_context { } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -13535,11 +13552,10 @@ struct llm_build_chatglm : public llm_graph_context { } Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, @@ -13559,7 +13575,7 @@ struct llm_build_chatglm : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13625,7 +13641,7 @@ struct llm_build_glm4 : public 
llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13660,6 +13676,7 @@ struct llm_build_glm4 : public llm_graph_context { } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -13669,11 +13686,10 @@ struct llm_build_glm4 : public llm_graph_context { } Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -13692,7 +13708,7 @@ struct llm_build_glm4 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -13775,7 +13791,7 @@ struct llm_build_glm4_moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13841,7 +13857,7 @@ struct llm_build_glm4_moe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_transformer_layers - 1 && inp_out_ids) { @@ -13935,7 +13951,7 @@ struct llm_build_nemotron : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -13995,7 +14011,7 @@ struct llm_build_nemotron : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -14064,7 +14080,7 @@ struct llm_build_exaone : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -14126,7 +14142,7 @@ struct llm_build_exaone : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + 
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -14196,13 +14212,13 @@ struct llm_build_exaone4 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>; + using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; inp_attn_type * inp_attn = nullptr; if constexpr (iswa) { - inp_attn = build_attn_inp_kv_unified_iswa(); + inp_attn = build_attn_inp_kv_iswa(); } else { - inp_attn = build_attn_inp_kv_unified(); + inp_attn = build_attn_inp_kv(); } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -14257,7 +14273,7 @@ struct llm_build_exaone4 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } @@ -15085,7 +15101,7 @@ struct llm_build_granite : public llm_graph_context { inp_pos = build_inp_pos(); } - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -15136,12 +15152,12 @@ struct llm_build_granite : public llm_graph_context { } ggml_tensor * build_attention_layer( - ggml_tensor * cur, - ggml_tensor * inp_pos, - llm_graph_input_attn_kv_unified * inp_attn, - const llama_model & model, - const int64_t n_embd_head, - const int il) { + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv * inp_attn, + const llama_model & model, + const int64_t n_embd_head, + const int il) { // compute Q and K and (optionally) RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -15192,7 +15208,7 @@ struct llm_build_granite : public llm_graph_context { const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); return cur; } @@ -15355,12 +15371,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { } ggml_tensor * build_attention_layer( - ggml_tensor * cur, - ggml_tensor * inp_pos, - llm_graph_input_attn_kv_unified * inp_attn, - const llama_model & model, - const int64_t n_embd_head, - const int il) { + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv * inp_attn, + const llama_model & model, + const int64_t n_embd_head, + const int il) { // compute Q and K and (optionally) RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -15411,7 +15427,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { const float kq_scale = hparams.f_attention_scale == 0.0f ?
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); return cur; } @@ -15517,7 +15533,7 @@ struct llm_build_chameleon : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -15596,7 +15612,7 @@ struct llm_build_chameleon : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -15848,7 +15864,7 @@ struct llm_build_plm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -15952,7 +15968,7 @@ struct llm_build_plm : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); + q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -16013,7 +16029,7 @@ struct llm_build_bailingmoe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -16075,7 +16091,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -16162,7 +16178,7 @@ struct llm_build_dots1 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -16215,7 +16231,7 @@ struct llm_build_dots1 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -16312,7 +16328,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -16370,7 +16386,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -16442,7 +16458,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * 
inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -16503,7 +16519,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } @@ -16656,7 +16672,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { ggml_tensor * attn_out = build_attn(inp->get_attn(), model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(attn_out, "attn_out", il); cur = build_norm(inpL, @@ -16816,7 +16832,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { private: ggml_tensor * build_plamo2_attn_layer( - llm_graph_input_attn_kv_unified * inp, + llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur, const llama_model & model, @@ -16840,13 +16856,13 @@ private: ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv)); ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv))); + ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); + Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); @@ -16866,7 +16882,9 @@ private: ext_factor, attn_factor, beta_fast, beta_slow ); - cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il); + cur = build_attn(inp, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il); } cb(cur, "attn_out", il); @@ -16913,15 +16931,13 @@ private: cb(zx, "mamba_in_proj", il); // {8192, 5, 1, 1} -> {8192, 1, 5, 1} zx = ggml_permute(ctx0, zx, 0, 2, 1, 3); - zx = ggml_cont(ctx0, zx); - zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); + zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs); cb(zx, "mamba_in_proj_out", il); // split into z and x // => {head_dim * n_heads, n_seq_tokens, n_seqs} ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx)); - x = ggml_cont(ctx0, x); - x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs); + x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs); // x = ggml_permute(ctx0, x, 0, 2, 1, 3); cb(x, "mamba_x_split", il); @@ -17051,7 +17067,7 @@ struct llm_build_arcee : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -17115,7 +17131,7 @@ struct llm_build_arcee : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -17186,7 +17202,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); @@ -17260,7 +17276,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -17347,7 +17363,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); @@ -17420,7 +17436,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -17485,7 +17501,7 @@ struct llm_build_smollm3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -17550,7 +17566,7 @@ struct llm_build_smollm3 : public llm_graph_context { cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -17617,7 +17633,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -17672,9 +17688,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn_with_sinks(inp_attn, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il); + Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il); cb(cur, "attn_out", il); } @@ -17799,10 +17815,10 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_attn_block(ggml_tensor * cur, - ggml_tensor * inp_pos, - llm_graph_input_attn_kv_unified * inp_attn, - int il) const { + ggml_tensor * build_attn_block(ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv * inp_attn, + int il) const { GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); auto const n_embd_head = hparams.n_embd_head_v; auto const n_head_kv = hparams.n_head_kv(il); @@ -17837,7 +17853,7 @@ struct llm_build_lfm2 : public llm_graph_context { ); cur = build_attn(inp_attn, model.layers[il].wo, NULL, - q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "model.layers.{}.self_attn.out_proj", il); @@ -17930,13 +17946,13 @@ struct llm_build_smallthinker : public llm_graph_context{ // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>; + using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; inp_attn_type * inp_attn = nullptr; if constexpr (iswa) { - inp_attn = build_attn_inp_kv_unified_iswa(); + inp_attn = build_attn_inp_kv_iswa(); } else { - inp_attn = build_attn_inp_kv_unified(); + inp_attn = build_attn_inp_kv(); } ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -17981,7 +17997,7 @@ struct llm_build_smallthinker : public llm_graph_context{ cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -18066,7 +18082,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, std::max((uint32_t) 1, cparams.n_seq_max), cparams.n_seq_max); } else if (llm_arch_is_hybrid(arch)) { - const auto padding = llama_kv_cache_unified::get_padding(cparams); + const auto padding = llama_kv_cache::get_padding(cparams); cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); @@ -18088,7 +18104,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ?
[&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); } else { - const auto padding = llama_kv_cache_unified::get_padding(cparams); + const auto padding = llama_kv_cache::get_padding(cparams); uint32_t n_ctx_per_stream = cparams.n_ctx; @@ -18108,7 +18124,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); - res = new llama_kv_cache_unified_iswa( + res = new llama_kv_cache_iswa( *this, params.type_k, params.type_v, @@ -18123,7 +18139,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } else { GGML_ASSERT(!hparams.is_swa_any()); - res = new llama_kv_cache_unified( + res = new llama_kv_cache( *this, nullptr, params.type_k, diff --git a/src/llama-model.h b/src/llama-model.h index 46f7d0480f..f639fa1398 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -79,6 +79,7 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, + LLM_TYPE_120B, LLM_TYPE_142B, LLM_TYPE_236B, LLM_TYPE_290B, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f4565f9b71..e21e904278 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2491,12 +2491,12 @@ struct test_bin_bcast : public test_case { : op(op), type(type), ne(ne), nr(nr), nf(nf) {} ggml_tensor * build_graph(ggml_context * ctx) override { - GGML_ASSERT(nf <= 8); + GGML_ASSERT(nf <= 16); ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); ggml_set_name(a, "a"); - ggml_tensor * b[8]; + ggml_tensor * b[16]; for (int i = 0; i < nf; ++i) { b[i] = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_set_name(b[i], (std::string("b") + std::to_string(i)).c_str()); @@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case { const std::array nr; // repeat in dims 3 and 4 const std::array per; // permutation of dimensions const bool v; // whether a and b are non-contiguous views + const uint32_t o; // number of outputs std::string vars() override { - return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v); + return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o); } double max_nmse_err() override { @@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case { std::array bs = {10, 10}, std::array nr = {2, 2}, std::array per = {0, 1, 2, 3}, - bool v = false) - : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {} + bool v = false, uint32_t o = 1) + : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {} ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) @@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case { ggml_tensor * out = ggml_mul_mat(ctx, a, b); ggml_set_name(out, "out"); + for (uint32_t i = 1; i < o; ++i) { + ggml_tensor * out2 = ggml_mul_mat(ctx, a, b); + ggml_set_name(out2, "out2"); + out = ggml_add(ctx, out, out2); + } return out; } + + bool run_whole_graph() override { return o > 1; } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return ggml_op_name(GGML_OP_MUL_MAT); + } }; // GGML_OP_MUL_MAT_ID @@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case { const int64_t m; const int64_t n; const int64_t k; + const uint32_t o; // number of outputs std::string vars() override { - return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k); + return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o); } double 
max_nmse_err() override { @@ -3217,9 +3231,9 @@ struct test_mul_mat_id : public test_case { test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int n_mats = 8, int n_used = 2, bool b = false, - int64_t m = 32, int64_t n = 32, int64_t k = 32) + int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1) : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b), - m(m), n(n), k(k) { + m(m), n(n), k(k), o(o) { GGML_ASSERT(n_used <= n_mats); } @@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case { ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); ggml_set_name(out, "out"); + for (uint32_t i = 1; i < o; ++i) { + ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids); + ggml_set_name(out2, "out2"); + out = ggml_add(ctx, out, out2); + } + return out; } @@ -3264,6 +3285,13 @@ struct test_mul_mat_id : public test_case { } } } + + bool run_whole_graph() override { return o > 1; } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return ggml_op_name(GGML_OP_MUL_MAT_ID); + } }; // GGML_OP_OUT_PROD @@ -5658,6 +5686,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}, 6)); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}, 7)); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); test_cases.emplace_back(new test_add1()); test_cases.emplace_back(new test_scale()); @@ -5797,6 +5826,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3)); for (auto bs2 : {1,3}) { for (auto bs : {1,2,4,8}) { @@ -5824,6 +5854,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, b, 32, 1024, 16)); } + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3)); + for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (int n_mats : {4, 8}) { @@ -6025,6 +6058,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen + test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 1, 1, 1}, order)); } for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) { diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index ea4060c876..a6daa93a82 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1408,7 +1408,7 @@ static void test_template_output_parsers() { /* is_partial= */ false, { /* .format = */ 
COMMON_CHAT_FORMAT_GRANITE, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); // Test parsing tool calls diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp index 03628f74b2..c6c601add3 100644 --- a/tools/batched-bench/batched-bench.cpp +++ b/tools/batched-bench/batched-bench.cpp @@ -57,6 +57,13 @@ int main(int argc, char ** argv) { return 1; } + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + const auto get_token_rand = [n_vocab]() -> llama_token { + return std::rand() % n_vocab; + }; + auto * mem = llama_get_memory(ctx); const int32_t n_kv_max = llama_n_ctx(ctx); @@ -93,7 +100,7 @@ int main(int argc, char ** argv) { // warm up { for (int i = 0; i < 16; ++i) { - common_batch_add(batch, 0, i, { 0 }, false); + common_batch_add(batch, get_token_rand(), i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -127,7 +134,7 @@ int main(int argc, char ** argv) { for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { for (int i = 0; i < pp; ++i) { - common_batch_add(batch, 0, i, { j }, i == pp - 1); + common_batch_add(batch, get_token_rand(), i, { j }, i == pp - 1); } } @@ -154,7 +161,7 @@ int main(int argc, char ** argv) { common_batch_clear(batch); for (int j = 0; j < pl; ++j) { - common_batch_add(batch, 0, pp + i, { j }, true); + common_batch_add(batch, get_token_rand(), pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index f1eb633369..706ed2e3b5 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -82,6 +82,7 @@ #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_NORM "mm.input_norm.weight" +#define TN_MM_INP_NORM_B "mm.input_norm.bias" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 @@ -133,6 +134,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN2A, PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_UNKNOWN, }; @@ -153,6 +155,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_LFM2, "lfm2"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fdaf9738e8..1676c32836 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -265,6 +265,7 @@ struct clip_model { // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; ggml_tensor * mm_0_w = nullptr; ggml_tensor * mm_0_b = nullptr; ggml_tensor * mm_2_w = nullptr; @@ -488,11 +489,17 @@ struct clip_graph { ggml_cgraph * build_siglip() { ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + ggml_tensor * cur = build_vit( inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, - model.position_embeddings, + learned_pos_embd, nullptr); if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { @@ -501,8 +508,8 @@ struct clip_graph { const int patches_per_image = 
n_patches_x; const int kernel_size = hparams.proj_scale_factor; - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); // doing a pool2d to reduce the number of output tokens cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); @@ -530,18 +537,57 @@ struct clip_graph { GGML_ASSERT(scale_factor != 0); cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, seq / (scale_factor * scale_factor), bsz); cur = ggml_mul_mat(ctx0, model.projection, cur); + } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.proj_scale_factor; + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + + // projection + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + cur = ggml_add(ctx0, cur, model.mm_1_b); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + cur = ggml_add(ctx0, cur, model.mm_2_b); } else { GGML_ABORT("SigLIP: Unsupported projector type"); } @@ -669,15 +715,15 @@ struct clip_graph { auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); inp = ggml_reshape_4d( ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size); } @@ -942,14 +988,14 @@ struct clip_graph { GGML_ASSERT(scale_factor > 0); cur = ggml_reshape_4d(ctx0, cur, n_embd * 
scale_factor, height / scale_factor, width, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]); } @@ -1035,14 +1081,14 @@ struct clip_graph { n_patches_y, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, n_patches_x / scale_factor, n_patches_y / scale_factor, bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), + cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, n_patches / scale_factor / scale_factor); cb(cur, "pixel_shuffle", -1); @@ -1275,8 +1321,8 @@ struct clip_graph { ggml_tensor * block_1 = nullptr; { // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); @@ -1381,9 +1427,9 @@ struct clip_graph { mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); // mlp_2 ne = [2048, 576, 1, 1] // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] @@ -1403,8 +1449,8 @@ struct clip_graph { // glm projector else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); @@ -1560,6 +1606,27 @@ private: } } + // siglip2 naflex + ggml_tensor * resize_position_embeddings() { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + + if (!pos_embd || height * width == pos_embd->ne[1]) { + return pos_embd; + } + + const int n_pos_embd = std::sqrt(pos_embd->ne[1]); + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_pos_embd, n_pos_embd); // -> (n_embd, n_pos_embd, n_pos_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_pos_embd, n_pos_embd, n_embd) + pos_embd = 
ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, 1); // -> (width, height, n_embd) + pos_embd = ggml_reshape_2d(ctx0, pos_embd, height * width, n_embd); // -> (height * width, n_embd) + pos_embd = ggml_transpose(ctx0, pos_embd); // -> (n_embd, height * width) + pos_embd = ggml_cont(ctx0, pos_embd); + + return pos_embd; + } + // build vision transformer (ViT) cgraph // this function should cover most of the models // if your model has specific features, you should probably duplicate this function @@ -1938,7 +2005,6 @@ private: ggml_row_size(cur->type, n_dim), ggml_row_size(cur->type, n_dim*n_head), n_dim/2 * ggml_element_size(cur)); - second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors second = ggml_rope_ext( ctx0, second, @@ -1966,6 +2032,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 switch (ctx->proj_type()) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_LFM2: { res = graph.build_siglip(); } break; @@ -2230,6 +2297,7 @@ struct clip_model_loader { } } break; case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_INTERNVL: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); @@ -2533,6 +2601,15 @@ struct clip_model_loader { { model.projection = get_tensor(TN_MM_PROJECTOR); } break; + case PROJECTOR_TYPE_LFM2: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; case PROJECTOR_TYPE_PIXTRAL: { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); @@ -3428,6 +3505,43 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_y = inst.grid_size.height; return true; + } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { + GGML_ASSERT(params.proj_scale_factor); + + // smart resize + const int width = img->nx; + const int height = img->ny; + const int total_factor = params.patch_size * params.proj_scale_factor; + constexpr int min_image_tokens = 64; + constexpr int max_image_tokens = 256; + const float min_pixels = min_image_tokens * total_factor * total_factor; + const float max_pixels = max_image_tokens * total_factor * total_factor; + + auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + int h_bar = std::max(total_factor, round_by_factor(height)); + int w_bar = std::max(total_factor, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt((height * width) / max_pixels); + h_bar = std::max(total_factor, floor_by_factor(height / beta)); + w_bar = std::max(total_factor, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(min_pixels / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + const std::array pad_color = {122, 116, 104}; + + clip_image_u8 resized_img; + 
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + return true; } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) @@ -3467,10 +3581,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; - + } else { + GGML_ABORT("Unknown image preprocessing type"); } - GGML_ASSERT(false && "Unknown image preprocessing type"); } ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { @@ -3534,8 +3648,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; - // only for models using fixed size square images - int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + // for models with fixed size image, the input image is already pre-processed and resized to square + int patch_size = params.patch_size; + int n_patches = (img->nx / patch_size) * (img->ny / patch_size); projector_type proj = ctx->proj_type(); @@ -3549,27 +3664,27 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LDPV2: case PROJECTOR_TYPE_GLM_EDGE: { - n_patches_sq /= 4; + n_patches /= 4; if (ctx->model.mm_glm_tok_boi) { - n_patches_sq += 2; // for BOI and EOI token embeddings + n_patches += 2; // for BOI and EOI token embeddings } } break; case PROJECTOR_TYPE_MINICPMV: { // Use actual config value if available, otherwise fall back to hardcoded values if (params.minicpmv_query_num > 0) { - n_patches_sq = params.minicpmv_query_num; + n_patches = params.minicpmv_query_num; } else { // Fallback to hardcoded values for legacy models if (params.minicpmv_version == 2) { - n_patches_sq = 96; + n_patches = 96; } else if (params.minicpmv_version == 3) { - n_patches_sq = 64; + n_patches = 64; } else if (params.minicpmv_version == 4) { - n_patches_sq = 64; + n_patches = 64; } else if (params.minicpmv_version == 5) { // MiniCPM-V 4.0 - n_patches_sq = 64; + n_patches = 64; } else { GGML_ABORT("Unknown minicpmv version"); } @@ -3578,63 +3693,56 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - // dynamic size + // dynamic size (2 conv, so double patch size) int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); - n_patches_sq = x_patch * y_patch; + n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: - { - int n_per_side = params.image_size / params.patch_size; - int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; - n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool; - } break; case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_LLAMA4: + case PROJECTOR_TYPE_LFM2: { // both W and H are divided by proj_scale_factor - n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor); + int scale_factor = ctx->model.hparams.proj_scale_factor; + n_patches /= (scale_factor * scale_factor); } break; case PROJECTOR_TYPE_PIXTRAL: { // dynamic size int n_merge = 
params.spatial_merge_size; - int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1); - n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row - } break; - case PROJECTOR_TYPE_LLAMA4: - { - int scale_factor = ctx->model.hparams.proj_scale_factor; - n_patches_sq /= (scale_factor * scale_factor); + int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } break; case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: { - n_patches_sq = img->nx; + n_patches = img->nx; const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; if (ctx->model.audio_has_stack_frames()) { GGML_ASSERT(proj_stack_factor > 0); - const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor); - n_patches_sq = n_len / proj_stack_factor; + const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor); + n_patches = n_len / proj_stack_factor; } // whisper downscales input token by half after conv1d - n_patches_sq /= 2; + n_patches /= 2; if (ctx->model.audio_has_avgpool()) { // divide by 2 because of nn.AvgPool1d(2, stride=2) - n_patches_sq /= 2; + n_patches /= 2; } } break; default: GGML_ABORT("unsupported projector type"); } - return n_patches_sq; + return n_patches; } static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { @@ -4034,6 +4142,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_VOXTRAL: { // do nothing @@ -4135,6 +4244,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_QWEN2A: return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_LFM2: + return ctx->model.mm_2_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 08f3efb7b1..3387cdbd36 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch */ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); -bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); - -/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); - /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index e73cf96af2..6f8a5f86ac 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -68,6 +68,7 @@ add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0" add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0" add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" +add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0" 
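For readers tracing the reworked `clip_n_output_tokens` path above, here is a rough standalone sketch (not part of the patch) of how the token count for the new LFM2 projector falls out of the actual pre-processed image size: patches are counted from width and height, then reduced by `proj_scale_factor^2` after pixel unshuffle. The concrete numbers in the usage line (512x512 input, 16px patches, scale factor 2) are illustrative assumptions, not values read from any model.

```python
# Rough sketch (not from the patch) of the LFM2 output-token math:
# count patches from the pre-processed image size, then divide by
# proj_scale_factor^2 to account for pixel unshuffle.
def lfm2_n_output_tokens(img_nx: int, img_ny: int, patch_size: int, scale_factor: int) -> int:
    n_patches = (img_nx // patch_size) * (img_ny // patch_size)
    return n_patches // (scale_factor * scale_factor)

# Assumed example: 512x512 image, 16px patches, proj_scale_factor = 2
# -> 1024 patches -> 256 image tokens, which sits at the max_image_tokens
# bound enforced by the smart-resize step above.
print(lfm2_n_output_tokens(512, 512, 16, 2))  # 256
```

This mirrors the shared `n_patches /= (scale_factor * scale_factor)` branch that LFM2 now uses alongside the other pixel-shuffle projectors in `clip_n_output_tokens`.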
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0" add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index c2a56aaa7f..83b608c32a 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -33,7 +33,7 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -target_include_directories(${TARGET} PRIVATE ../llava) +target_include_directories(${TARGET} PRIVATE ../mtmd) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index fad36ed339..0e3e1f9834 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0b40f7bfa4..35b060674b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -911,6 +911,17 @@ struct server_task_result_cmpl_final : server_task_result { {"model", oaicompat_model}, {"system_fingerprint", build_info}, {"object", "chat.completion.chunk"}, + }); + + // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage + // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices + deltas.push_back({ + {"choices", json::array()}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, {"usage", json { {"completion_tokens", n_decoded}, {"prompt_tokens", n_prompt_tokens}, @@ -1201,6 +1212,8 @@ struct server_task_result_metrics : server_task_result { uint64_t n_tokens_predicted_total = 0; uint64_t t_tokens_generation_total = 0; + uint64_t n_past_max = 0; + uint64_t n_prompt_tokens_processed = 0; uint64_t t_prompt_processing = 0; @@ -1226,6 +1239,8 @@ struct server_task_result_metrics : server_task_result { { "n_tokens_predicted_total", n_tokens_predicted_total }, { "t_prompt_processing_total", t_prompt_processing_total }, + { "n_past_max", n_past_max }, + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, { "t_prompt_processing", t_prompt_processing }, { "n_tokens_predicted", n_tokens_predicted }, @@ -1587,6 +1602,8 @@ struct server_metrics { uint64_t n_tokens_predicted_total = 0; uint64_t t_tokens_generation_total = 0; + uint64_t n_past_max = 0; + uint64_t n_prompt_tokens_processed = 0; uint64_t t_prompt_processing = 0; @@ -1605,6 +1622,10 @@ struct server_metrics { n_prompt_tokens_processed += slot.n_prompt_tokens_processed; t_prompt_processing += slot.t_prompt_processing; t_prompt_processing_total += slot.t_prompt_processing; + + if (slot.n_past > 0) { + n_past_max = std::max(n_past_max, (uint64_t) slot.n_past); + } } void on_prediction(const server_slot & slot) { @@ -1620,6 +1641,9 @@ struct server_metrics { if (slot.is_processing()) { n_busy_slots_total++; } + if (slot.n_past > 0) { + n_past_max = std::max(n_past_max, (uint64_t) slot.n_past); + } } } @@ -1716,7 +1740,7 @@ struct server_queue { void pop_deferred_task() { std::unique_lock lock(mutex_tasks); if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_back(std::move(queue_tasks_deferred.front())); + queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); queue_tasks_deferred.pop_front(); } condition_tasks.notify_one(); @@ -2875,6 +2899,8 @@ struct 
server_context { res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; res->t_tokens_generation_total = metrics.t_tokens_generation_total; + res->n_past_max = metrics.n_past_max; + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; res->t_prompt_processing = metrics.t_prompt_processing; res->n_tokens_predicted = metrics.n_tokens_predicted; @@ -4077,6 +4103,10 @@ int main(int argc, char ** argv) { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, {"value", res_metrics->n_decode_total} + }, { + {"name", "n_past_max"}, + {"help", "Largest observed n_past."}, + {"value", res_metrics->n_past_max} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 1485de8ceb..c7b3af0489 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -5,7 +5,7 @@ from utils import * server = ServerPreset.tinyllama2() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 6c6f64f5e2..509c024b75 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -72,27 +72,29 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte content = "" last_cmpl_id = None for i, data in enumerate(res): - choice = data["choices"][0] - if i == 0: - # Check first role message for stream=True - assert choice["delta"]["content"] is None - assert choice["delta"]["role"] == "assistant" + if data["choices"]: + choice = data["choices"][0] + if i == 0: + # Check first role message for stream=True + assert choice["delta"]["content"] is None + assert choice["delta"]["role"] == "assistant" + else: + assert "role" not in choice["delta"] + assert data["system_fingerprint"].startswith("b") + assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future + if last_cmpl_id is None: + last_cmpl_id = data["id"] + assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream + if choice["finish_reason"] in ["stop", "length"]: + assert "content" not in choice["delta"] + assert match_regex(re_content, content) + assert choice["finish_reason"] == finish_reason + else: + assert choice["finish_reason"] is None + content += choice["delta"]["content"] or '' else: - assert "role" not in choice["delta"] - assert data["system_fingerprint"].startswith("b") - assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future - if last_cmpl_id is None: - last_cmpl_id = data["id"] - assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream - if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted - assert "content" not in choice["delta"] - assert match_regex(re_content, content) - assert choice["finish_reason"] == finish_reason - else: - assert choice["finish_reason"] is None - content += choice["delta"]["content"] or '' def test_chat_completion_with_openai_library(): @@ -278,12 +280,14 @@ def test_chat_completion_with_timings_per_token(): assert data["choices"][0]["delta"]["role"] == "assistant" 
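The server.cpp and test hunks above make the final streamed `chat.completion.chunk` carry an empty `choices` array plus a `usage` object, so clients must not assume `choices[0]` exists in every chunk. A minimal client-side sketch under that assumption — the URL, request fields, and prompt are placeholders, not values taken from this diff:

```python
# Minimal sketch of consuming the OpenAI-compatible stream; the final chunk has
# an empty `choices` array and carries `usage` (llama-server also adds `timings`).
import json
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={"stream": True, "max_tokens": 32,
          "messages": [{"role": "user", "content": "Hello"}]},
    stream=True,
)
content, usage = "", None
for raw in resp.iter_lines():
    if not raw or not raw.startswith(b"data: "):
        continue
    payload = raw[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if chunk["choices"]:                      # ordinary delta chunk
        content += chunk["choices"][0]["delta"].get("content") or ""
    else:                                     # last chunk: usage only
        usage = chunk.get("usage")
print(content)
print(usage)
```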
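The metrics hunks above also add an `n_past_max` counter ("largest observed n_past") that is aggregated per slot and exported on the `/metrics` endpoint. A hypothetical way to read it back, assuming llama-server is running locally with its metrics endpoint enabled; the exact metric prefix in the exposition output is not shown in this diff, so the sketch just matches on the name:

```python
# Hypothetical scrape of the new n_past_max value from /metrics.
import requests

body = requests.get("http://localhost:8080/metrics").text
for line in body.splitlines():
    if "n_past_max" in line and not line.startswith("#"):
        print("largest observed n_past:", line.split()[-1])
        break
```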
assert "timings" not in data, f'First event should not have timings: {data}' else: - assert "role" not in data["choices"][0]["delta"] - assert "timings" in data - assert "prompt_per_second" in data["timings"] - assert "predicted_per_second" in data["timings"] - assert "predicted_n" in data["timings"] - assert data["timings"]["predicted_n"] <= 10 + if data["choices"]: + assert "role" not in data["choices"][0]["delta"] + else: + assert "timings" in data + assert "prompt_per_second" in data["timings"] + assert "predicted_per_second" in data["timings"] + assert "predicted_n" in data["timings"] + assert data["timings"]["predicted_n"] <= 10 def test_logprobs(): @@ -332,24 +336,25 @@ def test_logprobs_stream(): output_text = '' aggregated_text = '' for i, data in enumerate(res): - choice = data.choices[0] - if i == 0: - # Check first role message for stream=True - assert choice.delta.content is None - assert choice.delta.role == "assistant" - else: - assert choice.delta.role is None - if choice.finish_reason is None: - if choice.delta.content: - output_text += choice.delta.content - assert choice.logprobs is not None - assert choice.logprobs.content is not None - for token in choice.logprobs.content: - aggregated_text += token.token - assert token.logprob <= 0.0 - assert token.bytes is not None - assert token.top_logprobs is not None - assert len(token.top_logprobs) > 0 + if data.choices: + choice = data.choices[0] + if i == 0: + # Check first role message for stream=True + assert choice.delta.content is None + assert choice.delta.role == "assistant" + else: + assert choice.delta.role is None + if choice.finish_reason is None: + if choice.delta.content: + output_text += choice.delta.content + assert choice.logprobs is not None + assert choice.logprobs.content is not None + for token in choice.logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert token.top_logprobs is not None + assert len(token.top_logprobs) > 0 assert aggregated_text == output_text diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index be3a0052c6..adb6f27864 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -7,7 +7,7 @@ from utils import * server = ServerPreset.tinyllama2() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() @@ -229,7 +229,7 @@ def test_nocache_long_input_prompt(): "temperature": 1.0, "cache_prompt": False, }) - assert res.status_code == 200 + assert res.status_code == 400 def test_completion_with_tokens_input(): diff --git a/tools/server/tests/unit/test_ctx_shift.py b/tools/server/tests/unit/test_ctx_shift.py index 2431ac7088..8f51bc301a 100644 --- a/tools/server/tests/unit/test_ctx_shift.py +++ b/tools/server/tests/unit/test_ctx_shift.py @@ -11,7 +11,7 @@ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
""".strip() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() @@ -25,6 +25,7 @@ def test_ctx_shift_enabled(): # the prompt is truncated to keep the last 109 tokens # 64 tokens are generated thanks to shifting the context when it gets full global server + server.enable_ctx_shift = True server.start() res = server.make_request("POST", "/completion", data={ "n_predict": 64, @@ -42,7 +43,6 @@ def test_ctx_shift_enabled(): ]) def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool): global server - server.disable_ctx_shift = True server.n_predict = -1 server.start() res = server.make_request("POST", "/completion", data={ @@ -56,7 +56,6 @@ def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, tr def test_ctx_shift_disabled_long_prompt(): global server - server.disable_ctx_shift = True server.start() res = server.make_request("POST", "/completion", data={ "n_predict": 64, @@ -68,7 +67,6 @@ def test_ctx_shift_disabled_long_prompt(): def test_ctx_shift_disabled_stream(): global server - server.disable_ctx_shift = True server.start() res = server.make_stream_request("POST", "/v1/completions", data={ "n_predict": 256, diff --git a/tools/server/tests/unit/test_embedding.py b/tools/server/tests/unit/test_embedding.py index 0feb452ccf..50601b8396 100644 --- a/tools/server/tests/unit/test_embedding.py +++ b/tools/server/tests/unit/test_embedding.py @@ -8,7 +8,7 @@ server = ServerPreset.bert_bge_small() EPSILON = 1e-3 -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.bert_bge_small() diff --git a/tools/server/tests/unit/test_infill.py b/tools/server/tests/unit/test_infill.py index 10554db0f6..73dacdae81 100644 --- a/tools/server/tests/unit/test_infill.py +++ b/tools/server/tests/unit/test_infill.py @@ -3,7 +3,7 @@ from utils import * server = ServerPreset.tinyllama_infill() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama_infill() diff --git a/tools/server/tests/unit/test_lora.py b/tools/server/tests/unit/test_lora.py index c1aa8be70e..00b2f245f6 100644 --- a/tools/server/tests/unit/test_lora.py +++ b/tools/server/tests/unit/test_lora.py @@ -5,7 +5,7 @@ server = ServerPreset.stories15m_moe() LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf" -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.stories15m_moe() diff --git a/tools/server/tests/unit/test_rerank.py b/tools/server/tests/unit/test_rerank.py index f4f570ad5e..0b63c7821e 100644 --- a/tools/server/tests/unit/test_rerank.py +++ b/tools/server/tests/unit/test_rerank.py @@ -4,7 +4,7 @@ from utils import * server = ServerPreset.jina_reranker_tiny() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.jina_reranker_tiny() diff --git a/tools/server/tests/unit/test_security.py b/tools/server/tests/unit/test_security.py index 620b25376b..0e11580553 100644 --- a/tools/server/tests/unit/test_security.py +++ b/tools/server/tests/unit/test_security.py @@ -6,7 +6,7 @@ server = ServerPreset.tinyllama2() TEST_API_KEY = "sk-this-is-the-secret-key" -@pytest.fixture(scope="module", autouse=True) 
+@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() diff --git a/tools/server/tests/unit/test_slot_save.py b/tools/server/tests/unit/test_slot_save.py index 38704f5ece..1b428cc2a8 100644 --- a/tools/server/tests/unit/test_slot_save.py +++ b/tools/server/tests/unit/test_slot_save.py @@ -3,7 +3,7 @@ from utils import * server = ServerPreset.tinyllama2() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() diff --git a/tools/server/tests/unit/test_speculative.py b/tools/server/tests/unit/test_speculative.py index 54db38cf3b..38ca4325ba 100644 --- a/tools/server/tests/unit/test_speculative.py +++ b/tools/server/tests/unit/test_speculative.py @@ -16,7 +16,7 @@ def create_server(): server.draft_max = 8 -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def fixture_create_server(): return create_server() @@ -91,6 +91,7 @@ def test_slot_ctx_not_exceeded(): def test_with_ctx_shift(): global server server.n_ctx = 64 + server.enable_ctx_shift = True server.start() res = server.make_request("POST", "/completion", data={ "prompt": "Hello " * 56, diff --git a/tools/server/tests/unit/test_tokenize.py b/tools/server/tests/unit/test_tokenize.py index 382457c9d6..424cac5f3d 100644 --- a/tools/server/tests/unit/test_tokenize.py +++ b/tools/server/tests/unit/test_tokenize.py @@ -4,7 +4,7 @@ from utils import * server = ServerPreset.tinyllama2() -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinyllama2() diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py index 20f048c6f6..a3c3ccdf58 100755 --- a/tools/server/tests/unit/test_tool_call.py +++ b/tools/server/tests/unit/test_tool_call.py @@ -22,6 +22,8 @@ def create_server(): server.model_alias = "tinyllama-2-tool-call" server.server_port = 8081 server.n_slots = 1 + server.n_ctx = 8192 + server.n_batch = 2048 class CompletionMode(Enum): NORMAL = "normal" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index bc547ca03b..5f42bcae61 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -79,7 +79,7 @@ class ServerProcess: draft: int | None = None api_key: str | None = None lora_files: List[str] | None = None - disable_ctx_shift: int | None = False + enable_ctx_shift: int | None = False draft_min: int | None = None draft_max: int | None = None no_webui: bool | None = None @@ -178,8 +178,8 @@ class ServerProcess: if self.lora_files: for lora_file in self.lora_files: server_args.extend(["--lora", lora_file]) - if self.disable_ctx_shift: - server_args.extend(["--no-context-shift"]) + if self.enable_ctx_shift: + server_args.append("--context-shift") if self.api_key: server_args.extend(["--api-key", self.api_key]) if self.draft_max: @@ -318,46 +318,53 @@ class ServerProcess: arguments_parts = 0 for chunk in self.make_stream_request(method, path, data, headers): - assert len(chunk['choices']) == 1, f'Expected 1 choice, got {len(chunk["choices"])}' - choice = chunk['choices'][0] - if choice['delta'].get('content') is not None: - assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!' 
- content.append(choice['delta']['content']) - content_parts += 1 - if choice['delta'].get('reasoning_content') is not None: - assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!' - reasoning_content.append(choice['delta']['reasoning_content']) - reasoning_content_parts += 1 - if choice['delta'].get('finish_reason') is not None: - finish_reason = choice['delta']['finish_reason'] - for tc in choice['delta'].get('tool_calls', []): - if 'function' not in tc: - raise ValueError(f"Expected function type, got {tc['type']}") - if tc['index'] >= len(tool_calls): - assert 'id' in tc - assert tc.get('type') == 'function' - assert 'function' in tc and 'name' in tc['function'] and len(tc['function']['name']) > 0, \ - f"Expected function call with name, got {tc.get('function')}" - tool_calls.append(dict( - id="", - type="function", - function=dict( - name="", - arguments="", - ) - )) - tool_call = tool_calls[tc['index']] - if tc.get('id') is not None: - tool_call['id'] = tc['id'] - fct = tc['function'] - assert 'id' not in fct, f"Function call should not have id: {fct}" - if fct.get('name') is not None: - tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name'] - if fct.get('arguments') is not None: - tool_call['function']['arguments'] += fct['arguments'] - arguments_parts += 1 - tool_call_parts += 1 - + if chunk['choices']: + assert len(chunk['choices']) == 1, f'Expected 1 choice, got {len(chunk["choices"])}' + choice = chunk['choices'][0] + if choice['delta'].get('content') is not None: + assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!' + content.append(choice['delta']['content']) + content_parts += 1 + if choice['delta'].get('reasoning_content') is not None: + assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!' 
+                        reasoning_content.append(choice['delta']['reasoning_content'])
+                        reasoning_content_parts += 1
+                    if choice['delta'].get('finish_reason') is not None:
+                        finish_reason = choice['delta']['finish_reason']
+                    for tc in choice['delta'].get('tool_calls', []):
+                        if 'function' not in tc:
+                            raise ValueError(f"Expected function type, got {tc['type']}")
+                        if tc['index'] >= len(tool_calls):
+                            assert 'id' in tc
+                            assert tc.get('type') == 'function'
+                            assert 'function' in tc and 'name' in tc['function'] and len(tc['function']['name']) > 0, \
+                                f"Expected function call with name, got {tc.get('function')}"
+                            tool_calls.append(dict(
+                                id="",
+                                type="function",
+                                function=dict(
+                                    name="",
+                                    arguments="",
+                                )
+                            ))
+                        tool_call = tool_calls[tc['index']]
+                        if tc.get('id') is not None:
+                            tool_call['id'] = tc['id']
+                        fct = tc['function']
+                        assert 'id' not in fct, f"Function call should not have id: {fct}"
+                        if fct.get('name') is not None:
+                            tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
+                        if fct.get('arguments') is not None:
+                            tool_call['function']['arguments'] += fct['arguments']
+                            arguments_parts += 1
+                        tool_call_parts += 1
+                else:
+                    # When `include_usage` is True (the default), we expect the last chunk of the stream
+                    # immediately preceding the `data: [DONE]` message to contain a `choices` field with an empty array
+                    # and a `usage` field containing the usage statistics (n.b., llama-server also returns `timings` in
+                    # the last chunk)
+                    assert 'usage' in chunk, f"Expected 'usage' in final chunk: {chunk}"
+                    assert 'timings' in chunk, f"Expected 'timings' in final chunk: {chunk}"
         print(f'Streamed response had {content_parts} content parts, {reasoning_content_parts} reasoning_content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
         result = dict(
             choices=[
diff --git a/tools/server/webui/src/utils/app.context.tsx b/tools/server/webui/src/utils/app.context.tsx
index 0b95e88f89..1860ecc721 100644
--- a/tools/server/webui/src/utils/app.context.tsx
+++ b/tools/server/webui/src/utils/app.context.tsx
@@ -255,7 +255,7 @@ export const AppContextProvider = ({
       if (chunk.error) {
         throw new Error(chunk.error?.message || 'Unknown error');
       }
-      const addedContent = chunk.choices[0].delta.content;
+      const addedContent = chunk.choices[0]?.delta.content;
       const lastContent = pendingMsg.content || '';
       if (addedContent) {
         pendingMsg = {
diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp
index a71e9bf5b5..18f01a9946 100644
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -581,7 +581,6 @@ int main(int argc, char ** argv) {
     params.model = params.vocoder.model;
     params.embedding = true;
-    params.ctx_shift = false; // silence warning
     params.n_ubatch = params.n_batch;
 
     common_init_result llama_init_cts = common_init_from_params(params);
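Taken together, the test-harness changes in this diff flip context shifting from opt-out to opt-in (`disable_ctx_shift`/`--no-context-shift` become `enable_ctx_shift`/`--context-shift`) and make the `create_server` fixtures run once per test instead of once per module. A sketch of a test written against the updated harness; it reuses the existing `ServerPreset` utilities from `tools/server/tests/utils.py`, and the prompt length is an arbitrary choice:

```python
# Sketch of a test using the renamed flag; assumes the existing test utilities
# in tools/server/tests (ServerPreset, make_request), not a new API.
import pytest
from utils import *

server = ServerPreset.tinyllama2()

@pytest.fixture(autouse=True)  # per-test now, instead of scope="module"
def create_server():
    global server
    server = ServerPreset.tinyllama2()

def test_with_context_shift_enabled():
    global server
    server.enable_ctx_shift = True  # translated to the --context-shift CLI flag
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": "Hello " * 256,  # longer than the context to force a shift
    })
    assert res.status_code == 200
```

With the flag left at its default, context shifting stays off, which is why the `disable_ctx_shift = True` lines could simply be deleted from the disabled-case tests above.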
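One earlier server change is easy to miss: `pop_deferred_task` now re-queues a deferred task with `emplace_front` instead of `emplace_back`, so a task that had to wait for a free slot runs before tasks that arrived in the meantime. A toy illustration of the ordering difference, using plain Python deques rather than the server's actual types:

```python
from collections import deque

queue_tasks = deque(["task_B", "task_C"])  # arrived while task_A was deferred
deferred    = deque(["task_A"])            # was waiting for a free slot

# old behaviour: append to the back -> task_A would run last
# new behaviour: push to the front  -> task_A keeps its place in line
queue_tasks.appendleft(deferred.popleft())
assert list(queue_tasks) == ["task_A", "task_B", "task_C"]
```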