Merge 76775a5b8e into 0ccbfdef3e
commit 15cbe21cd0
@@ -0,0 +1,134 @@
ARG OPENVINO_VERSION_MAJOR=2025.3
ARG OPENVINO_VERSION_FULL=2025.3.0.19807.44526285f24
ARG UBUNTU_VERSION=24.04

# Optional proxy build arguments - empty by default
ARG http_proxy=
ARG https_proxy=

## Build Image
FROM ubuntu:${UBUNTU_VERSION} AS build

# Pass proxy args to build stage
ARG http_proxy
ARG https_proxy

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    gnupg \
    wget \
    git \
    cmake \
    ninja-build \
    build-essential \
    libtbb12 \
    libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

# Install OpenVINO for Ubuntu 24.04
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
RUN mkdir -p /opt/intel && \
    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
    cd - && \
    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino

ENV OpenVINO_DIR=/opt/intel/openvino

WORKDIR /app

COPY . .

# Build Stage
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_OPENVINO=ON && \
    cmake --build build/ReleaseOV -j$(nproc)"

# Copy all necessary libraries
RUN mkdir -p /app/lib && \
    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;

# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
    && cp build/ReleaseOV/bin/* /app/full/ \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base Runtime Image
FROM ubuntu:${UBUNTU_VERSION} AS base

# Pass proxy args to runtime stage
ARG http_proxy
ARG https_proxy

RUN apt-get update \
    && apt-get install -y libgomp1 libtbb12 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app/

### Full (all binaries)
FROM base AS full

ARG http_proxy
ARG https_proxy

COPY --from=build /app/full /app/

WORKDIR /app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    git \
    python3 \
    python3-venv \
    python3-pip && \
    python3 -m venv /ov-venv && \
    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app/

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app/

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -0,0 +1,25 @@
name: "Linux - Setup OpenVINO Toolkit"
description: "Setup OpenVINO Toolkit for Linux"
inputs:
  path:
    description: "Installation path"
    required: true
  version_major:
    description: "OpenVINO major version (e.g., 2025.3)"
    required: true
  version_full:
    description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"
    required: true

runs:
  using: "composite"
  steps:
    - name: Setup OpenVINO Toolkit
      id: setup
      uses: ./.github/actions/unarchive-tar
      with:
        url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
        path: ${{ inputs.path }}
        type: z
        strip: 1

@@ -63,6 +63,34 @@ jobs:
          path: ./spacemit_toolchain
          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}

  ubuntu-24-openvino-cache:
    runs-on: ubuntu-24.04

    env:
      # Make sure this is in sync with build.yml
      OPENVINO_VERSION_MAJOR: "2025.3"
      OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Setup Cache
        uses: actions/cache@v4
        id: cache-openvino
        with:
          path: ./openvino_toolkit
          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-openvino
        with:
          path: ./openvino_toolkit
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}

  windows-2022-rocm-cache:
    runs-on: windows-2022

@@ -743,6 +743,61 @@ jobs:
            -DGGML_SYCL_F16=ON
          cmake --build build --config Release -j $(nproc)

  ubuntu-24-cmake-openvino:
    runs-on: ubuntu-24.04

    env:
      # Make sure this is in sync with build-cache.yml
      OPENVINO_VERSION_MAJOR: "2025.3"
      OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-cmake-openvino-no-preset-v1
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip

      - name: Use OpenVINO Toolkit Cache
        uses: actions/cache@v4
        id: cache-openvino
        with:
          path: ./openvino_toolkit
          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-openvino
        with:
          path: ./openvino_toolkit
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}

      - name: Install OpenVINO dependencies
        run: |
          cd ./openvino_toolkit
          chmod +x ./install_dependencies/install_openvino_dependencies.sh
          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh

      - name: Build
        id: cmake_build
        run: |
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
          cmake --build build/ReleaseOV --config Release -j $(nproc)

  build-linux-cross:
    uses: ./.github/workflows/build-linux-cross.yml

@@ -47,6 +47,7 @@ jobs:
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
          - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
          - { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v6

@@ -231,6 +231,78 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
          name: llama-bin-ubuntu-vulkan-x64.tar.gz

  ubuntu-24-openvino:
    runs-on: ubuntu-24.04

    env:
      # Make sure this is in sync with build.yml
      OPENVINO_VERSION_MAJOR: "2025.3"
      OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-cmake-openvino-release-no-preset-v1
          evict-old-files: 1d

      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip

      - name: Use OpenVINO Toolkit Cache
        uses: actions/cache@v4
        id: cache-openvino
        with:
          path: ./openvino_toolkit
          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-openvino
        with:
          path: ./openvino_toolkit
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}

      - name: Install OpenVINO dependencies
        run: |
          cd ./openvino_toolkit
          chmod +x ./install_dependencies/install_openvino_dependencies.sh
          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh

      - name: Build
        id: cmake_build
        run: |
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/ReleaseOV/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip
          name: llama-bin-ubuntu-openvino-x64.zip

  windows-cpu:
    runs-on: windows-2025

ci/run.sh

@@ -25,6 +25,9 @@
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with OPENVINO support
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"

@@ -165,6 +168,15 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
        -DBUILD_SHARED_LIBS=OFF"
fi

if [ ! -z ${GG_BUILD_OPENVINO} ]; then
    if [ -z ${OpenVINO_DIR} ]; then
        echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
        echo "source /opt/intel/openvino/setupvars.sh"
        exit 1
    fi
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF"
fi

## helpers

# download a file if it does not exist or if it is outdated
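Taken together, the OpenVINO CI path would be exercised roughly as follows (a sketch; it assumes OpenVINO has been installed to /opt/intel/openvino as described in docs/build.md):

```bash
# Make OpenVINO visible to the build; ci/run.sh checks that OpenVINO_DIR is set
source /opt/intel/openvino/setupvars.sh

# Run the CI script with the OpenVINO backend enabled on CPU
GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
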
@@ -0,0 +1,124 @@
# OpenVINO Backend for llama.cpp

This document describes the OpenVINO backend for `llama.cpp`, which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**.

The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.

## Overview

The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. It supports FP16 and BF16 models, as well as selected quantized GGUF formats. This backend enables accelerated inference on Intel CPUs, integrated and discrete GPUs, and NPUs, while integrating seamlessly with the existing `llama.cpp` execution flow.

## Supported Devices

The OpenVINO backend supports the following hardware:

- Intel CPUs
- Intel integrated and discrete GPUs
- Intel NPUs (requires the UD32+ driver)

Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2.

## Supported Model Precisions

- `FP16`
- `BF16` (on Intel Xeon)
- `Q4_0`
- `Q4_1`
- `Q4_K_M`
- `Q6_K`

Accuracy and performance optimizations for quantized models are still a work in progress.

## Quantization Support Details

### CPU and GPU

- **`Q4_0`, `Q4_1`, `Q4_K_M`, and `Q6_K` models are supported**
- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`

### NPU

- **The primary supported quantization scheme is `Q4_0`**
- `Q6_K` tensors are generally requantized to `Q4_0_128`. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C`, except for the token embedding matrix, which is dequantized to FP16

### Additional Notes

- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization with `llama-quantize`
- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)

## Validated Models

The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:

- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF)
- [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)

## Build Instructions

For detailed build instructions, refer to [build.md](../build.md#openvino).
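As a quick reference, the Linux build boils down to the following (a sketch mirroring the CI job in this change; it assumes OpenVINO is already installed and `setupvars.sh` is available):

```bash
# Initialize the OpenVINO environment (path depends on your installation)
source /opt/intel/openvino/setupvars.sh

# Configure and build llama.cpp with the OpenVINO backend enabled
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --parallel
```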

## Runtime Configuration

The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.

### Configuration Options

| Variable | Description |
|----------|-------------|
| `GGML_OPENVINO_DEVICE` | Specify the target device (`CPU`, `GPU`, `NPU`). If not set, the backend automatically selects the first available device in priority order: **GPU → CPU → NPU**. When set to `NPU`, static compilation mode is enabled for optimal performance. |
| `GGML_OPENVINO_CACHE_DIR` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
| `GGML_OPENVINO_PROFILING` | Enable execution-time profiling. |
| `GGML_OPENVINO_DUMP_CGRAPH` | Dump the GGML compute graph to `cgraph.txt`. |
| `GGML_OPENVINO_DUMP_IR` | Export OpenVINO IR files with timestamps. |
| `GGML_OPENVINO_DEBUG_INPUT` | Enable input debugging. |
| `GGML_OPENVINO_DEBUG_OUTPUT` | Enable output debugging. |
| `GGML_OPENVINO_STATEFUL_EXECUTION`* | Enable stateful execution for better performance (experimental). |

\* `GGML_OPENVINO_STATEFUL_EXECUTION` is an **experimental** feature that enables stateful execution, managing the KV cache internally inside the OpenVINO model and improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support it. It has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications, where enabling it is recommended for best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
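For example, stateful execution can be enabled for one of the validated tools like this (a sketch; the model path is a placeholder):

```bash
# Experimental: keep the KV cache inside the OpenVINO model (CPU/GPU only)
export GGML_OPENVINO_STATEFUL_EXECUTION=1
export GGML_OPENVINO_DEVICE=GPU

./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf
```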

### Example Usage

#### GPU Inference with Profiling

```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU

./build/ReleaseOV/bin/llama-simple \
    -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf \
    -n 50 \
    "The story of AI is "
```

#### llama-bench

```bash
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
```

`-fa 1` is required when running llama-bench with the OpenVINO backend.

### NPU Notes

- Model caching is not yet supported
- Does not support `llama-server` with `-np > 1` (multiple parallel sequences)
- Only supports `llama-perplexity` with `-b 512` or smaller (see the sketch below)
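A sketch of NPU usage within these constraints (model and text file paths are placeholders; `-b` and `-np` are the llama.cpp options referenced above):

```bash
export GGML_OPENVINO_DEVICE=NPU

# Single-sequence generation (the NPU path does not support -np > 1)
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf

# Perplexity runs must keep the batch size at 512 or below
./build/ReleaseOV/bin/llama-perplexity -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -b 512 -f wiki.test.raw
```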

## Llama.cpp Tools

The following tools work with the OpenVINO backend on CPU and GPU: llama-simple, llama-run, llama-cli, llama-server, llama-bench, and llama-perplexity.
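For instance, llama-server can be started against the GPU device like this (a sketch; the host, port, and model path are illustrative):

```bash
GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-server \
    -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf --host 0.0.0.0 --port 8080
```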

## Work in Progress

- Performance and memory optimizations
- Broader quantization coverage
- Support for additional model architectures
- Extensive accuracy validation

docs/build.md

@@ -13,6 +13,21 @@ cd llama.cpp

The following sections describe how to build with different backends and options.

* [CPU Build](#cpu-build)
* [BLAS Build](#blas-build)
* [Metal Build](#metal-build)
* [SYCL](#sycl)
* [CUDA](#cuda)
* [MUSA](#musa)
* [HIP](#hip)
* [Vulkan](#vulkan)
* [CANN](#cann)
* [Arm® KleidiAI™](#arm-kleidiai)
* [OpenCL](#opencl)
* [Android](#android-1)
* [OpenVINO](#openvino)
* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)

## CPU Build

Build llama.cpp using `CMake`:

@@ -718,6 +733,190 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m

To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)

## OpenVINO

[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and at the edge.
The OpenVINO backend improves performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.

Follow the instructions below to install the OpenVINO runtime and build llama.cpp with OpenVINO support. For more detailed information on the OpenVINO backend, refer to [OPENVINO.md](backend/OPENVINO.md).

### Prerequisites

- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).

- **Linux:**
  - Git, CMake, and Ninja are required for building:
    ```bash
    sudo apt-get update
    sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
    ```
  - OpenCL:
    ```bash
    sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
    ```

- **Windows:**
  - Download Microsoft.VisualStudio.2022.BuildTools: [Visual_Studio_Build_Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe)
    Select "Desktop development with C++" under workloads
  - Install git
  - Install OpenCL with vcpkg:
    ```powershell
    cd C:\
    git clone https://github.com/microsoft/vcpkg
    cd vcpkg
    bootstrap-vcpkg.bat
    vcpkg install opencl
    ```
  - Use the "x64 Native Tools Command Prompt" for the build

### 1. Install OpenVINO Runtime

- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)

- **Linux:**

  <details>
  <summary>📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu</summary>
  <br>

  ```bash
  wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
  chmod +x install-openvino-from-archive.sh
  ./install-openvino-from-archive.sh
  ```

  Verify OpenVINO is initialized properly:
  ```bash
  echo $OpenVINO_DIR
  ```
  </details>
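If you prefer not to run the helper script, the same layout can be produced manually with the archive steps used by this backend's Dockerfile (a sketch; adjust the version variables to the release you want):

```bash
OPENVINO_VERSION_MAJOR=2025.3
OPENVINO_VERSION_FULL=2025.3.0.19807.44526285f24

# Download and unpack the archive into /opt/intel, then expose it as /opt/intel/openvino
sudo mkdir -p /opt/intel
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino

# Install runtime dependencies and initialize the environment
echo "Y" | sudo /opt/intel/openvino/install_dependencies/install_openvino_dependencies.sh
source /opt/intel/openvino/setupvars.sh
```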

### 2. Build llama.cpp with OpenVINO Backend

Clone the OpenVINO-enabled llama.cpp fork and build it:

```bash
git clone https://github.com/ravi9/llama.cpp.git
cd llama.cpp
git switch dev_backend_openvino
```

- **Linux:**
  ```bash
  source /opt/intel/openvino/setupvars.sh
  cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
  cmake --build build/ReleaseOV --parallel
  ```

- **Windows:**
  ```bash
  "C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat"
  cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
  cmake --build build\ReleaseOV --parallel
  ```

### 3. Download Sample Model

Download models for testing:

```bash
mkdir -p ~/models/
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
    -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### 4. Run Inference with the OpenVINO Backend

When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

```bash
# If the device is unset or unavailable, the backend defaults to CPU.
export GGML_OPENVINO_DEVICE=GPU
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

To run in chat mode:
```bash
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### Configuration Options

Control OpenVINO behavior using these environment variables:

- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, the backend automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU`, static compilation mode is enabled for optimal performance.
- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: not yet supported on NPU devices.
- **`GGML_OPENVINO_PROFILING`**: Enable execution-time profiling.
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save the compute graph to `cgraph.txt`.
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
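For example, enabling the model cache so that repeated runs skip graph recompilation (a sketch; `/tmp/ov_cache` is just the recommended location):

```bash
# Cache compiled OpenVINO models between runs (not yet supported on NPU)
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_DEVICE=GPU

./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```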

### Example with Profiling

```bash
GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

### Docker Build of llama.cpp with the OpenVINO Backend

You can build and run llama.cpp with the OpenVINO backend using Docker.

```bash
# Build the base runtime image with compiled shared libraries and minimal dependencies.
docker build -t llama-openvino:base -f .devops/openvino.Dockerfile .

# Build the complete image with all binaries, Python tools, the gguf-py library, and model conversion utilities.
docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile .

# Build a minimal CLI-only image containing just the llama-cli executable.
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .

# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .

# If you are behind a proxy:
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
```

Run the llama.cpp OpenVINO Docker containers.
Save sample models in `~/models` as [shown above](#3-download-sample-model); this directory is mounted into the container in the examples below.

```bash
# Run the Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
    --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
    llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

# With Intel NPU access
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
    --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
    llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
```

Run the llama.cpp server with the OpenVINO backend:
```bash
# Run the server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

# In a NEW terminal, test the server with curl

# If you are behind a proxy, make sure to set NO_PROXY to avoid proxying localhost
export NO_PROXY=localhost,127.0.0.1

# Test the health endpoint
curl -f http://localhost:8080/health

# Test with a simple prompt
curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" \
    -d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
```

---
## Notes about GPU-accelerated backends

The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
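For example, to force a CPU-only run regardless of the GPUs present (a sketch; the model path is a placeholder):

```bash
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -ngl 0 --device none
```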

@@ -248,6 +248,8 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
    "ggml: sycl device architecture")

option(GGML_OPENVINO "ggml: use OPENVINO" OFF)

option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)

@@ -327,6 +329,7 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-vulkan.h
    include/ggml-webgpu.h
    include/ggml-zendnn.h
    include/ggml-openvino.h
    include/gguf.h)

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

@@ -0,0 +1,62 @@
#pragma once

#include "ggml-backend.h"
#include "ggml.h"

#include <array>
#include <cstring>

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_OPENVINO_NAME "OPENVINO"
#define GGML_OPENVINO_MAX_DEVICES 16

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);

GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);

GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);

GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);

GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);

struct ggml_openvino_device_info {
    int device_count;

    struct openvino_device_info {
        int    cc;                // compute capability
        int    nsm;               // number of streaming multiprocessors
        size_t smpb;              // max. shared memory per block
        size_t smpbo;             // max. shared memory per block (with opt-in)
        bool   vmm;               // virtual memory support
        size_t vmm_granularity;   // granularity of virtual memory
        size_t total_vram;
    };

    openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};

    std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
};

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
const ggml_openvino_device_info & ggml_openvino_info();
#endif

@@ -460,6 +460,7 @@ ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
ggml_add_backend(ZenDNN)
ggml_add_backend(OPENVINO)

foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

@@ -82,6 +82,10 @@
#include "ggml-zendnn.h"
#endif

#ifdef GGML_USE_OPENVINO
#include "ggml-openvino.h"
#endif

namespace fs = std::filesystem;

static std::string path_str(const fs::path & path) {

@@ -154,6 +158,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
        register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_OPENVINO
        register_backend(ggml_backend_openvino_reg());
#endif
#ifdef GGML_USE_CPU
        register_backend(ggml_backend_cpu_reg());
#endif

@@ -557,6 +564,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("opencl", silent, dir_path);
    ggml_backend_load_best("hexagon", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
    ggml_backend_load_best("openvino", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
    const char * backend_path = std::getenv("GGML_BACKEND_PATH");

|
|||
|
|
@ -0,0 +1,154 @@
|
|||
---
|
||||
# Override root .clang-format
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
Cpp11BracedListStyle: true
|
||||
SpacesInContainerLiterals: false
|
||||
BreakBeforeBraces: Attach
|
||||
AccessModifierOffset: -4
|
||||
IndentCaseBlocks: false
|
||||
IndentCaseLabels: false
|
||||
|
||||
Language: Cpp
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignArrayOfStructures: Left
|
||||
AlignConsecutiveBitFields: AcrossComments
|
||||
AlignConsecutiveMacros: AcrossComments
|
||||
# AlignConsecutiveShortCaseStatements: AcrossComments
|
||||
AlignEscapedNewlines: Left # LeftWithLastLine
|
||||
AlignOperands: Align
|
||||
AlignTrailingComments:
|
||||
Kind: Always
|
||||
OverEmptyLines: 1
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Inline
|
||||
AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
|
||||
AttributeMacros:
|
||||
- __host__
|
||||
- __device__
|
||||
- __global__
|
||||
- __forceinline__
|
||||
- __launch_bounds__
|
||||
BinPackArguments: true
|
||||
BinPackParameters: false # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
# BreakAdjacentStringLiterals: true
|
||||
BreakAfterAttributes: Never
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeInlineASMColon: OnlyMultiline
|
||||
BreakBeforeTernaryOperators: false
|
||||
# BreakBinaryOperations: Never
|
||||
BreakConstructorInitializers: AfterColon
|
||||
# BreakFunctionDefinitionParameters: false
|
||||
BreakInheritanceList: AfterComma
|
||||
BreakStringLiterals: true
|
||||
# BreakTemplateDeclarations: Yes
|
||||
ColumnLimit: 120
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
EmptyLineBeforeAccessModifier: Leave
|
||||
EmptyLineAfterAccessModifier: Never
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '".*"'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 4
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentAccessModifiers: false
|
||||
IndentExternBlock: NoIndent
|
||||
IndentGotoLabels: false
|
||||
IndentPPDirectives: AfterHash
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
InsertBraces: true # NOTE: may lead to incorrect formatting
|
||||
InsertNewlineAtEOF: true
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
LambdaBodyIndentation: Signature
|
||||
LineEnding: LF
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 4
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PPIndentWidth: -1
|
||||
PackConstructorInitializers: CurrentLine
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Middle
|
||||
QualifierAlignment: Left
|
||||
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
|
||||
RawStringFormats:
|
||||
- Language: Cpp
|
||||
Delimiters:
|
||||
- cc
|
||||
- CC
|
||||
- cpp
|
||||
- Cpp
|
||||
- CPP
|
||||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
ReferenceAlignment: Middle
|
||||
ReflowComments: false # IndentOnly
|
||||
SeparateDefinitionBlocks: Always
|
||||
SortIncludes: CaseInsensitive
|
||||
SortUsingDeclarations: LexicographicNumeric
|
||||
SpaceAfterCStyleCast: true
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: Never
|
||||
SpacesInLineCommentPrefix:
|
||||
Minimum: 1
|
||||
Maximum: -1
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
Standard: c++17
|
||||
TabWidth: 4
|
||||
UseTab: Never
|
||||
WhitespaceSensitiveMacros: ['STRINGIZE']
|
||||
...
|
||||
|
|
@@ -0,0 +1,22 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)

include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")

file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")

ggml_add_backend_library(ggml-openvino
    ${GGML_SOURCES_OPENVINO}
    ${GGML_HEADERS_OPENVINO}
)

target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)

if (GGML_OPENVINO)
    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
    else()
        message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
    endif()
endif()

@ -0,0 +1,930 @@
|
|||
#include "ggml-decoder.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-openvino-extra.h"
|
||||
#include "ggml-openvino.h"
|
||||
#include "ggml-quants.hpp"
|
||||
|
||||
#include <ggml-impl.h>
|
||||
#include <ggml.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <execution>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
#include <openvino/core/type/bfloat16.hpp>
|
||||
#include <openvino/core/type/element_type.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/parameter.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <optional>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
|
||||
ModelParams & model_params,
|
||||
ComputeParams & compute_params,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
|
||||
bool is_static,
|
||||
bool is_stateful,
|
||||
bool is_prefill,
|
||||
int prefill_chunk_size) :
|
||||
m_is_static(is_static),
|
||||
m_is_stateful(is_stateful),
|
||||
m_is_prefill(is_prefill),
|
||||
m_naive(false),
|
||||
m_prefill_chunk_size(prefill_chunk_size),
|
||||
m_cgraph(cgraph),
|
||||
m_model_weights(model_weights),
|
||||
m_model_params(model_params),
|
||||
m_compute_params(compute_params) {
|
||||
if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
|
||||
#ifdef _WIN32
|
||||
_putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", "");
|
||||
#else
|
||||
unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
|
||||
#endif
|
||||
print_tensor_address_map(cgraph);
|
||||
}
|
||||
|
||||
validate_cgraph();
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto * cur_node = cgraph->nodes[node_n];
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
|
||||
m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
|
||||
}
|
||||
|
||||
add_extra_inputs();
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
|
||||
m_cgraph = cgraph;
|
||||
m_model_inputs.clear();
|
||||
m_model_outputs.clear();
|
||||
m_node_info_list.clear();
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto * cur_node = cgraph->nodes[node_n];
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
|
||||
m_cgraph = cgraph;
|
||||
m_model_weights = model_weights;
|
||||
m_naive = true;
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto * cur_node = cgraph->nodes[node_n];
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
|
||||
m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
|
||||
}
|
||||
// Iterate through node_info_list to create model inputs and outputs.
|
||||
// For inputs: if an input of a node is not seen as an output of any previous node, it is a model input.
|
||||
// For outputs: every node output is a model output unless its data_addr is overridden by a later node.
|
||||
std::map<void *, ggml_tensor *> data_addr_map;
|
||||
std::unordered_set<std::string> output_name_set;
|
||||
for (const auto & node_info : m_node_info_list) {
|
||||
if (node_info.node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
for (const auto & it : node_info.node_inputs) {
|
||||
const auto & src_name = it.first;
|
||||
const auto & src_node = it.second;
|
||||
|
||||
if (output_name_set.find(src_name) == output_name_set.end() &&
|
||||
m_model_weights.find(src_name) == m_model_weights.end() &&
|
||||
m_model_inputs.find(src_name) == m_model_inputs.end()) {
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src_node), get_shape(src_node));
|
||||
param_node->set_friendly_name(src_name);
|
||||
param_node->output(0).get_tensor().set_names({src_name});
|
||||
m_model_inputs[src_name] = param_node;
|
||||
}
|
||||
}
|
||||
output_name_set.emplace(node_info.node_output_name);
|
||||
data_addr_map[node_info.data_addr] = node_info.node_output;
|
||||
}
|
||||
for (const auto & it : data_addr_map) {
|
||||
// No need to add view tensors as model outputs
|
||||
if (it.second->op != GGML_OP_VIEW) {
|
||||
m_model_outputs[std::string(it.second->name)] = it.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::set_input_output(ggml_tensor * node) {
|
||||
NodeInfo current_node_info;
|
||||
auto node_name = std::string(node->name);
|
||||
auto node_output_name = node_name;
|
||||
auto * node_output = node;
|
||||
if (node->op == GGML_OP_SET_ROWS) {
|
||||
// SET_ROWS updates the tensor in place. For later ov op that uses the
|
||||
// the view_src of SET_ROWS, we need to make sure they get the updated tensor
|
||||
// by putting the view_src name in the tensor_map in
|
||||
// <openvino>/src/frontends/ggml/src/translate_session.cpp
|
||||
node_output_name = std::string(node->view_src->name);
|
||||
node_output = node->view_src;
|
||||
}
|
||||
|
||||
current_node_info.node = node;
|
||||
current_node_info.node_name = node_name;
|
||||
current_node_info.node_output = node_output;
|
||||
current_node_info.node_output_name = node_output_name;
|
||||
current_node_info.node_op_case = 0;
|
||||
current_node_info.data_addr = node->data;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto * src = node->src[i];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
auto src_name = std::string(src->name);
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
src_name = get_graph_input_ov_name(src, node);
|
||||
}
|
||||
m_inputs[src_name] = src;
|
||||
current_node_info.node_inputs[src_name] = src;
|
||||
current_node_info.node_inputs_names.push_back(src_name);
|
||||
|
||||
// Add model inputs
|
||||
if (!m_naive && !src->view_src) {
|
||||
ggml_backend_buffer * buffer = src->buffer;
|
||||
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
ov::PartialShape stateful_kv_shape;
|
||||
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
|
||||
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
|
||||
it == m_model_params.kv_names.end()) {
|
||||
m_model_params.kv_names.push_back(src_name);
|
||||
if (is_stateful()) {
|
||||
// TODO: The shape modification for stateful model below is not validated for all supported models yet. More generic solution might be needed
|
||||
// to enable additional cases. Ideally, this could be removed from decoder and done as part of a transformation later.
|
||||
auto stateless_kv_shape = get_graph_input_shape(node, src);
|
||||
assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 &&
|
||||
stateless_kv_shape[1] == 1 && stateless_kv_shape[2].is_dynamic() &&
|
||||
stateless_kv_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
|
||||
stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(),
|
||||
m_model_params.n_heads_kv, m_model_params.head_size};
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
|
||||
continue;
|
||||
}
|
||||
assert(stateful_kv_shape.rank().is_static());
|
||||
ov::PartialShape param_shape =
|
||||
(stateful_kv_shape.rank().get_length() != 0) ? stateful_kv_shape : get_graph_input_shape(node, src);
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
|
||||
param_node->set_friendly_name(src_name);
|
||||
param_node->output(0).get_tensor().set_names({src_name});
|
||||
m_model_inputs[src_name] = param_node;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add model outputs
|
||||
if (!m_naive) {
|
||||
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
|
||||
static std::set<std::string> debug_output_names = {};
|
||||
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
|
||||
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
|
||||
debug_output_names.count(node_output_name)) {
|
||||
if (m_model_outputs.find(node_output_name) == m_model_outputs.end()) {
|
||||
m_model_outputs[node_output_name] = node_output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_node_info_list.push_back(current_node_info);
|
||||
}
|
||||
|
||||
int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
||||
int op_case = 0;
|
||||
switch (node->op) {
|
||||
case GGML_OP_RESHAPE: {
|
||||
auto * src = node->src[0];
|
||||
if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
|
||||
op_case = 4;
|
||||
} else if (node->ne[0] * node->ne[1] == src->ne[0]) {
|
||||
op_case = 1;
|
||||
} else if (src->ne[0] * src->ne[1] == node->ne[0]) {
|
||||
op_case = 2;
|
||||
if (src->ne[2] * src->ne[3] == node->ne[1]) {
|
||||
op_case = 5;
|
||||
}
|
||||
} else if (src->ne[0] * src->ne[1] == node->ne[1]) {
|
||||
op_case = 3;
|
||||
} else if (src->ne[1] * src->ne[2] == node->ne[1]) {
|
||||
op_case = 6;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
if (node->src[0]->op == GGML_OP_PERMUTE) {
|
||||
op_case = 1;
|
||||
} else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
|
||||
op_case = 2;
|
||||
} else if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
op_case = 3;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_PERMUTE: {
|
||||
if (node->src[0]->op != GGML_OP_VIEW) {
|
||||
op_case = 1;
|
||||
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
|
||||
// kv cache tensor
|
||||
std::string src_name(node->view_src->name);
|
||||
int layer = extract_layer_from_name(src_name);
|
||||
if (!is_swa_layer(layer)) {
|
||||
op_case = 2;
|
||||
} else {
|
||||
op_case = 3;
|
||||
}
|
||||
} else {
|
||||
// rope'ed query tensor
|
||||
op_case = 4;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
|
||||
op_case = 2;
|
||||
} else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
|
||||
op_case = 3;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_GET_ROWS: {
|
||||
if (node->src[1]->op == GGML_OP_VIEW) {
|
||||
op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_VIEW: {
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
auto * src = node->src[0];
|
||||
if (ggml_nelements(node) != ggml_nelements(src)) {
|
||||
throw std::runtime_error("Unsupported VIEW case");
|
||||
}
|
||||
op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return op_case;
|
||||
}
|
||||
|
||||
int extract_layer_from_name(const std::string & name) {
|
||||
size_t pos1 = name.find("_l");
|
||||
assert(pos1 != std::string::npos);
|
||||
pos1 += 2;
|
||||
size_t pos2 = name.find(' ', pos1);
|
||||
if (pos2 == std::string::npos) {
|
||||
pos2 = name.length();
|
||||
}
|
||||
std::string layer_str = name.substr(pos1, pos2 - pos1);
|
||||
int layer = std::stoi(layer_str);
|
||||
return layer;
|
||||
}
|
||||
|
||||
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
|
||||
ModelParams model_params;
|
||||
ComputeParams compute_params;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
auto * node = cgraph->nodes[i];
|
||||
std::string name = std::string(node->name);
|
||||
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
model_params.n_heads = node->src[0]->ne[2];
|
||||
model_params.n_heads_kv = node->src[1]->ne[2];
|
||||
model_params.head_size = node->src[0]->ne[0];
|
||||
compute_params.input_len = node->src[0]->ne[1];
|
||||
|
||||
auto * cache_k_perm = node->src[1];
|
||||
if (cache_k_perm->op == GGML_OP_CPY) {
|
||||
cache_k_perm = cache_k_perm->src[0];
|
||||
}
|
||||
assert(cache_k_perm->op == GGML_OP_PERMUTE);
|
||||
auto * cache_k_view = cache_k_perm->src[0];
|
||||
assert(cache_k_view->op == GGML_OP_VIEW);
|
||||
|
||||
auto * cache_k = cache_k_view->src[0];
|
||||
int layer = extract_layer_from_name(cache_k->name);
|
||||
auto * mask = node->src[3];
|
||||
std::string mask_name(mask->name);
|
||||
|
||||
model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
|
||||
if (mask_name.find("swa") != std::string::npos) {
|
||||
model_params.swa_layers.push_back(layer);
|
||||
model_params.ctx_per_seq_swa = cache_k->ne[1];
|
||||
} else {
|
||||
model_params.ctx_per_seq = cache_k->ne[1];
|
||||
model_params.n_seq = cache_k->ne[2];
|
||||
}
|
||||
|
||||
compute_params.n_seq_active = mask->ne[3];
|
||||
auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
|
||||
size_t offset;
|
||||
memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
|
||||
compute_params.seq_active_start = offset / seq_size;
|
||||
compute_params.token_len_per_seq = node->ne[2];
|
||||
|
||||
if (mask_name.find("swa") != std::string::npos) {
|
||||
compute_params.attention_size_swa = mask->ne[0];
|
||||
} else {
|
||||
compute_params.attention_size = mask->ne[0];
|
||||
}
|
||||
if (is_static) {
|
||||
compute_params.attention_size = model_params.ctx_per_seq;
|
||||
compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
|
||||
compute_params.token_len_per_seq = 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (node->op == GGML_OP_ROPE) {
|
||||
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
|
||||
}
|
||||
}
|
||||
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
|
||||
compute_params.output_len = output_tensor->ne[1];
|
||||
// for NPU, output_len is always 1 except for llama-perplexity
|
||||
if (is_static && compute_params.output_len == 0) {
|
||||
compute_params.output_len = 1;
|
||||
}
|
||||
model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
|
||||
model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq;
|
||||
return {model_params, compute_params};
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::validate_cgraph() const {
|
||||
if (m_model_params.n_seq > 1 && m_is_static == true) {
|
||||
throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
|
||||
}
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
|
||||
auto name = std::string(input->name);
|
||||
ov::PartialShape input_shape;
|
||||
|
||||
if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
|
||||
// tokens or positions
|
||||
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
|
||||
input_shape = ov::PartialShape{1, 1, 1, len};
|
||||
|
||||
} else if (is_output_idx(input, op)) {
|
||||
// output index
|
||||
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
|
||||
|
||||
} else if (is_inp_mask(input, op)) {
|
||||
// mask
|
||||
if (m_is_static) {
|
||||
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
|
||||
} else if (m_is_stateful) {
|
||||
input_shape = ov::PartialShape{1, 1, -1, -1};
|
||||
} else {
|
||||
input_shape = ov::PartialShape{-1, 1, -1, -1};
|
||||
}
|
||||
|
||||
} else if (is_kvcache(input, op)) {
|
||||
// kvcache
|
||||
input_shape = ov::PartialShape{get_shape(input)};
|
||||
if (!m_is_static) {
|
||||
// do not fix ctx size to make llama-bench work across test params
|
||||
input_shape[2] = -1;
|
||||
}
|
||||
|
||||
} else if (is_kv_idx(input, op)) {
|
||||
// kv update index
|
||||
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
|
||||
input_shape = ov::PartialShape{1, 1, 1, len};
|
||||
|
||||
} else {
|
||||
input_shape = ov::PartialShape{get_shape(input)};
|
||||
}
|
||||
return input_shape;
|
||||
}
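// Example of the shape policy above: with m_is_static == false, token and position inputs
// are exposed as {1, 1, 1, -1}, and KV-cache inputs keep their ggml shape except that
// dimension 2 is made dynamic, so llama-bench can vary the context size without rebuilding
// the model.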
|
||||
|
||||
void GgmlOvDecoder::add_extra_inputs() {
|
||||
// Extra inputs:
|
||||
// 1. `attention_size`, used in FLASH_ATTN where the shapes of the matmuls are 256-aligned,
|
||||
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
|
||||
// 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
|
||||
|
||||
auto create_1d_input = [this](const std::string & name, int64_t value) {
|
||||
if (m_is_static) {
|
||||
auto constant =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
|
||||
constant->set_friendly_name(name);
|
||||
m_model_extra_inputs[name] = constant;
|
||||
} else {
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
|
||||
param_node->set_friendly_name(name);
|
||||
param_node->output(0).get_tensor().set_names({name});
|
||||
m_model_extra_inputs[name] = param_node;
|
||||
|
||||
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
|
||||
*tensor->data<int64_t>() = value;
|
||||
m_model_extra_input_values[name] = tensor;
|
||||
}
|
||||
};
|
||||
|
||||
create_1d_input("attention_size", m_compute_params.attention_size);
|
||||
if (m_compute_params.attention_size_swa != -1) {
|
||||
create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
|
||||
}
|
||||
create_1d_input("n_seq_active", m_compute_params.n_seq_active);
|
||||
create_1d_input("seq_active_start", m_compute_params.seq_active_start);
|
||||
create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
|
||||
create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
|
||||
// create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active);
|
||||
}
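// Hedged usage note (the actual call site lives elsewhere in the backend): for the
// non-static case, the values collected in m_model_extra_input_values are expected to be
// bound to the compiled model by the caller, roughly:
//
//   for (const auto & [name, value] : decoder.get_model_extra_input_values()) {
//       infer_request.set_tensor(name, *value);  // ov::InferRequest::set_tensor
//   }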
|
||||
|
||||
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
|
||||
if (tensor == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
const auto * node = m_cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] == tensor) {
|
||||
return node;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
const auto * node = m_cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
const auto * src = node->src[j];
|
||||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
if (std::string(src->name) == name) {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
|
||||
std::map<std::string, std::string> kv_param_res_names;
|
||||
for (const auto & name : m_model_params.kv_names) {
|
||||
kv_param_res_names[name] = name;
|
||||
}
|
||||
return kv_param_res_names;
|
||||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
// static std::mutex weights_mutex;
|
||||
auto * nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
// std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
|
||||
for (int node_i = 0; node_i < n_nodes; node_i++) {
|
||||
auto * node = nodes[node_i];
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto * src = node->src[i];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string src_name(src->name);
|
||||
if (is_rope_freqs_weight(src, node)) {
|
||||
src_name = "rope_freqs.weight";
|
||||
}
|
||||
if (!src->view_src) {
|
||||
ggml_backend_buffer * buffer = src->buffer;
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
|
||||
// bool should_create = false;
|
||||
// {
|
||||
// std::lock_guard<std::mutex> lock(weights_mutex);
|
||||
// if (model_weights.find(src_name) == model_weights.end()) {
|
||||
// model_weights[src_name] = nullptr;
|
||||
// should_create = true;
|
||||
// }
|
||||
// }
|
||||
// if (should_create) {
|
||||
// auto weight_node = create_weight_node(src);
|
||||
// weight_node->set_friendly_name(src_name);
|
||||
// {
|
||||
// std::lock_guard<std::mutex> lock(weights_mutex);
|
||||
// model_weights[src_name] = weight_node;
|
||||
// }
|
||||
// }
|
||||
if (model_weights.find(src_name) == model_weights.end()) {
|
||||
auto weight_node = create_weight_node(src, naive);
|
||||
weight_node->set_friendly_name(src_name);
|
||||
model_weights[src_name] = weight_node;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// });
|
||||
return model_weights;
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
|
||||
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
|
||||
|
||||
// Check if we have a pre-built constant from the OpenVINO backend buffer
|
||||
// This is set during ggml_backend_openvino_buffer_set_tensor
|
||||
if (tensor->extra) {
|
||||
OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
" (possibly a CPU-backend repacked quantized weight)");
|
||||
// Cast to our extra base type and check the type
|
||||
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
|
||||
|
||||
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
|
||||
// F16/F32/BF16 weight with shared-memory constant
|
||||
auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
|
||||
if (weight_extra->weight_node) {
|
||||
// GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
|
||||
return weight_extra->weight_node;
|
||||
}
|
||||
} else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
|
||||
// Quantized weight with pre-extracted data
|
||||
auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
|
||||
if (quant_extra->weight_node) {
|
||||
// GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
|
||||
return quant_extra->weight_node;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// There are three cases where we need to create a new weight node:
|
||||
// 1. Weights are in an openvino_host_buffer. Loading weights into the host buffer does not trigger backend_buffer_set_tensor.
// 2. Weights are in a cpu/cpu_mapped buffer. Only token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used.
// 3. test-backend-ops: its buffers do not set USAGE_WEIGHTS, so backend_buffer_set_tensor will not create a weight node.
|
||||
|
||||
// GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
|
||||
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
||||
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
|
||||
if (weight_types.find(tensor->type) == weight_types.end()) {
|
||||
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
|
||||
ggml_type_name(tensor->type));
|
||||
}
|
||||
|
||||
OvWeight ov_weight;
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
auto use_bias = naive;
|
||||
if (is_ov_buffer) {
|
||||
// For quantized weights, copy raw data to a temp buffer first because
|
||||
// process_weight_tensor reads from data and writes extracted results
|
||||
// (weights/scales/zp) to output_base_ptr — they would overlap if both
|
||||
// point to tensor->data.
|
||||
size_t raw_size = ggml_nbytes(tensor);
|
||||
std::vector<uint8_t> tmp(raw_size);
|
||||
memcpy(tmp.data(), tensor->data, raw_size);
|
||||
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
|
||||
} else {
|
||||
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
|
||||
}
|
||||
} else {
|
||||
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
|
||||
// process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
|
||||
ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
|
||||
}
|
||||
|
||||
ov_weight.weight_node->set_friendly_name(tensor->name);
|
||||
if (!is_ov_buffer) {
|
||||
return ov_weight.weight_node;
|
||||
}
|
||||
|
||||
ggml_openvino_extra_base * extra;
|
||||
if (ov_weight.is_quantized()) {
|
||||
extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
|
||||
std::move(ov_weight.zp), ov_weight.weight_node);
|
||||
} else {
|
||||
extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
|
||||
}
|
||||
ggml_openvino_buffer_register_extra(tensor, extra);
|
||||
|
||||
return ov_weight.weight_node;
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
|
||||
std::ofstream file(filename);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
file << "=== GRAPH ===\n";
|
||||
|
||||
// clang-format off
|
||||
file << "n_nodes = " << cgraph->n_nodes << "\n";
|
||||
file << " " << std::setw(3) << "nodes"
|
||||
<< std::setw(15) << "shape"
|
||||
<< std::setw(20) << "op"
|
||||
<< std::setw(20) << "name"
|
||||
<< std::setw(3) << " "
|
||||
<< std::setw(62) << "stride"
|
||||
<< std::setw(20) << "buffer_type"
|
||||
<< "\n";
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
// Get buffer type name
|
||||
const char * buf_name = "none";
|
||||
ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
|
||||
if (buf) {
|
||||
buf_name = ggml_backend_buffer_name(buf);
|
||||
}
|
||||
|
||||
file << " - " << std::setw(3) << i << ": [ "
|
||||
<< std::setw(5) << node->ne[0] << ", "
|
||||
<< std::setw(5) << node->ne[1] << ", "
|
||||
<< std::setw(5) << node->ne[2] << ", "
|
||||
<< std::setw(5) << node->ne[3] << "] "
|
||||
<< std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
|
||||
<< std::left << std::setw(45) << node->name << std::right
|
||||
<< std::setw(2) << "[ "
|
||||
<< std::setw(0) << node->nb[0] << ", "
|
||||
<< std::setw(5) << node->nb[1] << ", "
|
||||
<< std::setw(5) << node->nb[2] << ", "
|
||||
<< std::setw(5) << node->nb[3] << "] "
|
||||
<< std::right << std::setw(15) << buf_name << std::right
|
||||
<< "\n";
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (auto * src = node->src[j]) {
|
||||
// Get buffer type name for source
|
||||
const char * src_buf_name = "none";
|
||||
ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
if (src_buf) {
|
||||
src_buf_name = ggml_backend_buffer_name(src_buf);
|
||||
}
|
||||
|
||||
file << std::setw(10) << " [ "
|
||||
<< std::setw(5) << src->ne[0] << ", "
|
||||
<< std::setw(5) << src->ne[1] << ", "
|
||||
<< std::setw(5) << src->ne[2] << ", "
|
||||
<< std::setw(5) << src->ne[3] << "] "
|
||||
<< std::setw(12)
|
||||
<< j << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
|
||||
file << std::left << std::setw(30) << src->name << std::right
|
||||
<< std::setw(16) << "[ "
|
||||
<< std::setw(0) << src->nb[0] << ", "
|
||||
<< std::setw(5) << src->nb[1] << ", "
|
||||
<< std::setw(5) << src->nb[2] << ", "
|
||||
<< std::setw(5) << src->nb[3] << "] "
|
||||
<< std::right << std::setw(15) << src_buf_name << std::right
|
||||
<< "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file << "n_leafs = " << cgraph->n_leafs << "\n";
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
ggml_tensor * node = cgraph->leafs[i];
|
||||
|
||||
// Get buffer type name for leaf
|
||||
const char * leaf_buf_name = "none";
|
||||
ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
|
||||
if (leaf_buf) {
|
||||
leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
|
||||
}
|
||||
|
||||
file << " - " << std::setw(3) << i << ": [ "
|
||||
<< std::setw(5) << node->ne[0] << ", "
|
||||
<< std::setw(5) << node->ne[1] << "] "
|
||||
<< std::setw(8) << ggml_op_name(node->op) << " "
|
||||
<< std::setw(16) << ggml_get_name(node)
|
||||
<< std::setw(20) << leaf_buf_name << "\n";
|
||||
}
|
||||
// clang-format on
|
||||
file << "========================================\n";
|
||||
|
||||
file.close();
|
||||
}
|
||||
|
||||
void print_tensor_address_map(const ggml_cgraph * cgraph) {
|
||||
std::map<void *, std::vector<std::string>> address_map;
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto * node = cgraph->nodes[node_n];
|
||||
if (node->data) {
|
||||
auto it = address_map.find(node->data);
|
||||
if (it == address_map.end()) {
|
||||
address_map[node->data] = std::vector<std::string>();
|
||||
}
|
||||
address_map[node->data].push_back(node->name);
|
||||
}
|
||||
}
|
||||
for (const auto & pair : address_map) {
|
||||
std::cout << "Address: " << pair.first << std::endl;
|
||||
for (const auto & name : pair.second) {
|
||||
std::cout << name << " ; ";
|
||||
}
|
||||
std::cout << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
|
||||
std::vector<size_t> shape;
|
||||
for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
|
||||
shape.push_back(static_cast<size_t>(tensor->ne[i]));
|
||||
}
|
||||
return shape;
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
|
||||
std::vector<size_t> stride;
|
||||
for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
|
||||
stride.push_back(static_cast<size_t>(tensor->nb[i]));
|
||||
}
|
||||
return stride;
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F64:
|
||||
return ov::element::f64;
|
||||
case GGML_TYPE_F32:
|
||||
return ov::element::f32;
|
||||
case GGML_TYPE_F16:
|
||||
return ov::element::f16;
|
||||
case GGML_TYPE_BF16:
|
||||
return ov::element::bf16;
|
||||
case GGML_TYPE_I8:
|
||||
return ov::element::i8;
|
||||
case GGML_TYPE_I16:
|
||||
return ov::element::i16;
|
||||
case GGML_TYPE_I32:
|
||||
return ov::element::i32;
|
||||
case GGML_TYPE_I64:
|
||||
return ov::element::i64;
|
||||
default:
|
||||
return ov::element::dynamic;
|
||||
}
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
|
||||
return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name)));
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
|
||||
return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
|
||||
return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name));
|
||||
}
|
||||
|
||||
size_t GgmlOvDecoder::get_input_size() const {
|
||||
return m_model_inputs.size();
|
||||
}
|
||||
|
||||
size_t GgmlOvDecoder::get_input_size(int node_idx) const {
|
||||
return m_node_info_list[node_idx].node_inputs_names.size();
|
||||
}
|
||||
|
||||
std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
|
||||
return m_node_info_list[node_idx].node_inputs_names;
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
|
||||
auto * ggml_tensor = m_node_info_list[node_idx].node_output;
|
||||
return ov::PartialShape(get_shape(ggml_tensor));
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
|
||||
return get_ov_type(m_node_info_list[node_idx].node);
|
||||
}
|
||||
|
||||
std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
|
||||
return {m_node_info_list[node_idx].node_output_name};
|
||||
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_name() const {
|
||||
static const std::string unknown_name = "UNKNOWN_OP_NAME";
|
||||
return unknown_name;
|
||||
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
|
||||
return m_node_info_list[node_idx].node_name;
|
||||
}
|
||||
|
||||
int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
|
||||
return m_node_info_list[node_idx].node_inputs.at(name)->op_params;
|
||||
}
|
||||
|
||||
int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
|
||||
return m_node_info_list[node_idx].node->op_params;
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
|
||||
for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
|
||||
if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
node_visitor(std::make_shared<GgmlOvDecoder>(*this), node_idx);
|
||||
}
|
||||
}
|
||||
|
||||
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
|
||||
static const std::map<ggml_op, std::string> ops = {
|
||||
{GGML_OP_NONE, "GGML_OP_NONE" },
|
||||
{GGML_OP_ACC, "GGML_OP_ACC" },
|
||||
{GGML_OP_ADD, "GGML_OP_ADD" },
|
||||
{GGML_OP_ADD1, "GGML_OP_ADD1" },
|
||||
{GGML_OP_CONT, "GGML_OP_CONT" },
|
||||
{GGML_OP_DIV, "GGML_OP_DIV" },
|
||||
{GGML_OP_DUP, "GGML_OP_DUP" },
|
||||
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
|
||||
{GGML_OP_MUL, "GGML_OP_MUL" },
|
||||
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
|
||||
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
|
||||
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
|
||||
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
|
||||
{GGML_OP_ROPE, "GGML_OP_ROPE" },
|
||||
{GGML_OP_SCALE, "GGML_OP_SCALE" },
|
||||
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
|
||||
{GGML_OP_SUB, "GGML_OP_SUB" },
|
||||
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
|
||||
{GGML_OP_VIEW, "GGML_OP_VIEW" },
|
||||
{GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
|
||||
{GGML_OP_CPY, "GGML_OP_CPY" },
|
||||
{GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
|
||||
};
|
||||
static const std::map<ggml_unary_op, std::string> unary_ops = {
|
||||
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
|
||||
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
|
||||
{GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
|
||||
{GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
|
||||
{GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
|
||||
{GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
|
||||
{GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
|
||||
{GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
|
||||
{GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
|
||||
{GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
|
||||
{GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
|
||||
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
|
||||
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
|
||||
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
|
||||
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
|
||||
};
|
||||
static const std::map<ggml_glu_op, std::string> glu_ops = {
|
||||
{GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
|
||||
{GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
|
||||
{GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
|
||||
};
|
||||
|
||||
switch (node->op) {
case GGML_OP_UNARY:
return unary_ops.at(ggml_get_unary_op(node));
case GGML_OP_GLU:
return glu_ops.at(ggml_get_glu_op(node));
default:
break;
}
if (auto it = ops.find(node->op); it != ops.end()) {
return it->second;
}
// Fall back to a stable placeholder instead of throwing for ops that are not in the table.
static const std::string unknown_op = "UNKNOWN_GGML_OP";
return unknown_op;
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
|
||||
return m_node_info_list[node_idx].node_op_type;
|
||||
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_type() const {
|
||||
static const std::string unknown_op = "UNKNOWN_GGML_OP";
|
||||
return unknown_op;
|
||||
}
@ -0,0 +1,295 @@
#pragma once
|
||||
|
||||
#include "ggml-quants.hpp"
|
||||
#include "ggml.h"
|
||||
#include "openvino/decoder.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
struct ModelParams {
|
||||
int ctx = -1;
|
||||
int ctx_swa = -1;
|
||||
int ctx_per_seq = -1;
|
||||
int ctx_per_seq_swa = -1;
|
||||
int n_seq = 1;
|
||||
int n_heads = -1;
|
||||
int n_heads_kv = -1;
|
||||
int head_size = -1;
|
||||
int32_t rope_params[15];
|
||||
std::vector<int> swa_layers;
|
||||
|
||||
std::vector<std::string> kv_names;
|
||||
size_t kv_buffer_ctx_id = 0;
|
||||
|
||||
bool same_rope_params(const ModelParams & other) const {
|
||||
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
|
||||
}
|
||||
|
||||
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
|
||||
|
||||
bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }
|
||||
|
||||
bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
|
||||
};
|
||||
|
||||
struct ComputeParams {
|
||||
int n_seq_active = 1;
|
||||
int seq_active_start = 0;
|
||||
int attention_size = -1;
|
||||
int attention_size_swa = -1;
|
||||
int input_len = -1;
|
||||
int token_len_per_seq = -1;
|
||||
int past_kv_len = -1;
|
||||
int output_len = 1;
|
||||
};
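// Hedged note: ModelParams captures properties that feed model (re)compilation decisions
// (see can_reuse_dynamically / can_reuse_statically), while ComputeParams carries per-graph
// values (active sequences, attention sizes, output length) that can change between decode steps.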
|
||||
|
||||
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
||||
public:
|
||||
struct NodeInfo {
|
||||
ggml_tensor * node;
|
||||
std::string node_name;
|
||||
std::string node_op_type;
|
||||
std::map<std::string, ggml_tensor *> node_inputs;
|
||||
std::vector<std::string> node_inputs_names;
|
||||
ggml_tensor * node_output;
|
||||
std::string node_output_name;
|
||||
int node_op_case = 0;
|
||||
void * data_addr;
|
||||
};
|
||||
// Graph decoder
|
||||
GgmlOvDecoder(ggml_cgraph * cgraph,
|
||||
ModelParams & model_params,
|
||||
ComputeParams & compute_params,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
|
||||
bool is_static,
|
||||
bool is_stateful = false,
|
||||
bool is_prefill = false,
|
||||
int prefill_chunk_size = 256);
|
||||
|
||||
// Naive graph decoder
|
||||
GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
|
||||
|
||||
virtual ov::Any get_attribute(const std::string & name) const override {
|
||||
return nullptr;
|
||||
GGML_UNUSED(name);
|
||||
}
|
||||
|
||||
virtual ov::PartialShape get_input_shape(int node_idx, const std::string & name) const override;
|
||||
|
||||
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const override;
|
||||
|
||||
virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override;
|
||||
|
||||
virtual size_t get_input_size() const override;
|
||||
|
||||
virtual size_t get_input_size(int node_idx) const override;
|
||||
|
||||
virtual void get_input_node(size_t input_port_idx,
|
||||
std::string & producer_name,
|
||||
std::string & producer_output_port_name,
|
||||
size_t & producer_output_port_index) const override {
|
||||
GGML_UNUSED(input_port_idx);
|
||||
GGML_UNUSED(producer_name);
|
||||
GGML_UNUSED(producer_output_port_name);
|
||||
GGML_UNUSED(producer_output_port_index);
|
||||
}
|
||||
|
||||
virtual std::vector<std::string> get_input_names(int node_idx) const override;
|
||||
|
||||
virtual ov::PartialShape get_output_shape(int node_idx) const override;
|
||||
|
||||
virtual ov::element::Type get_output_type(int node_idx) const override;
|
||||
|
||||
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
|
||||
|
||||
virtual int32_t * get_output_op_params(int node_idx) const override;
|
||||
|
||||
virtual std::vector<std::string> get_output_names(int node_idx) const override;
|
||||
|
||||
virtual const std::string & get_op_type() const override;
|
||||
|
||||
virtual const std::string & get_op_type(int node_idx) const override;
|
||||
|
||||
virtual const std::string & get_op_name() const override;
|
||||
|
||||
virtual const std::string & get_op_name(int node_idx) const override;
|
||||
|
||||
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
|
||||
|
||||
ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
|
||||
|
||||
virtual int get_op_case(int node_idx) const override { return m_node_info_list[node_idx].node_op_case; }
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const override {
|
||||
return m_model_inputs;
|
||||
}
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const override {
|
||||
return m_model_extra_inputs;
|
||||
}
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>> & get_model_extra_input_values() const {
|
||||
return m_model_extra_input_values;
|
||||
}
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const override {
|
||||
return m_model_weights;
|
||||
}
|
||||
|
||||
virtual std::vector<std::string> get_model_output_names() const override {
|
||||
std::vector<std::string> output_names;
|
||||
output_names.reserve(m_model_outputs.size());
|
||||
for (const auto & [name, tensor] : m_model_outputs) {
|
||||
output_names.push_back(name);
|
||||
}
|
||||
return output_names;
|
||||
}
|
||||
|
||||
const std::map<std::string, ggml_tensor *> & get_model_outputs() const { return m_model_outputs; }
|
||||
|
||||
virtual int get_ctx_size() const { return m_model_params.ctx; }
|
||||
|
||||
virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }
|
||||
|
||||
virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }
|
||||
|
||||
virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
|
||||
|
||||
virtual int get_n_seq() const { return m_model_params.n_seq; }
|
||||
|
||||
virtual int is_swa_layer(int layer) const override {
|
||||
return std::find(m_model_params.swa_layers.begin(), m_model_params.swa_layers.end(), layer) !=
|
||||
m_model_params.swa_layers.end();
|
||||
}
|
||||
|
||||
int get_past_kv_len() const { return m_compute_params.past_kv_len; }
|
||||
|
||||
int get_input_len() const { return m_compute_params.input_len; }
|
||||
|
||||
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
|
||||
|
||||
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
|
||||
|
||||
virtual bool is_static() const override { return m_is_static; }
|
||||
|
||||
virtual bool is_stateful() const override { return m_is_stateful; }
|
||||
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
|
||||
|
||||
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
|
||||
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor, bool naive = false);
|
||||
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph,
|
||||
bool naive = false);
|
||||
|
||||
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
|
||||
|
||||
const ggml_tensor * get_tensor_from_name(const std::string & name) const;
|
||||
|
||||
void clear_model_weights() { m_model_weights.clear(); }
|
||||
|
||||
static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static);
|
||||
|
||||
ModelParams get_model_params() const { return m_model_params; }
|
||||
|
||||
ComputeParams get_compute_params() const { return m_compute_params; }
|
||||
|
||||
void set_model_params(const ModelParams & model_params) { m_model_params = model_params; }
|
||||
|
||||
void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; }
|
||||
|
||||
bool m_is_static = false;
|
||||
bool m_is_stateful = false;
|
||||
bool m_is_prefill = false;
|
||||
bool m_naive = false;
|
||||
int m_prefill_chunk_size = 0;
|
||||
|
||||
static ov::Shape get_shape(const ggml_tensor * tensor);
|
||||
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
|
||||
static ov::element::Type get_ov_type(const ggml_tensor * tensor);
|
||||
static std::string compute_op_type(const ggml_tensor * node);
|
||||
void add_extra_inputs();
|
||||
|
||||
void update_io(ggml_cgraph * cgraph);
|
||||
|
||||
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
|
||||
}
|
||||
|
||||
inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_ROPE && tensor == op->src[1];
|
||||
}
|
||||
|
||||
inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
|
||||
}
|
||||
|
||||
inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
|
||||
}
|
||||
|
||||
inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_ROPE && tensor == op->src[2];
|
||||
}
|
||||
|
||||
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
|
||||
}
|
||||
|
||||
inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor;
|
||||
}
|
||||
|
||||
inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
|
||||
}
|
||||
|
||||
static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
if (is_inp_tok(tensor, op)) {
|
||||
return "inp_tokens";
|
||||
}
|
||||
if (is_inp_pos(tensor, op)) {
|
||||
return "inp_pos";
|
||||
}
|
||||
if (is_inp_emb(tensor, op)) {
|
||||
return "embd";
|
||||
}
|
||||
if (is_output_idx(tensor, op)) {
|
||||
return "inp_out_ids";
|
||||
}
|
||||
if (is_inp_mask(tensor, op)) {
|
||||
return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
|
||||
}
|
||||
return tensor->name;
|
||||
}
|
||||
|
||||
private:
|
||||
void set_input_output(ggml_tensor * node);
|
||||
int compute_op_case(const ggml_tensor * node) const;
|
||||
|
||||
void validate_cgraph() const;
|
||||
|
||||
ggml_cgraph * m_cgraph = nullptr;
|
||||
std::map<std::string, ggml_tensor *> m_inputs;
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
|
||||
std::map<std::string, ggml_tensor *> m_model_outputs;
|
||||
std::vector<NodeInfo> m_node_info_list;
|
||||
|
||||
ModelParams m_model_params;
|
||||
ComputeParams m_compute_params;
|
||||
};
|
||||
|
||||
void print_tensor_address_map(const ggml_cgraph * cgraph);
|
||||
|
||||
int extract_layer_from_name(const std::string & name);
@ -0,0 +1,373 @@
#include "ggml-openvino-extra.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
|
||||
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
|
||||
#include <optional>
|
||||
|
||||
ov::Core & ov_singleton_core() {
|
||||
static ov::Core core;
|
||||
return core;
|
||||
}
|
||||
|
||||
// =====================================================
|
||||
// Device Configuration Implementations
|
||||
// =====================================================
|
||||
|
||||
void ggml_openvino_device_config::init() {
|
||||
if (initialized) {
|
||||
return;
|
||||
}
|
||||
device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
|
||||
auto available_devices = ov_singleton_core().get_available_devices();
|
||||
if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
|
||||
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
|
||||
device_name = "CPU";
|
||||
}
|
||||
is_npu = (device_name == "NPU");
|
||||
|
||||
auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
|
||||
if (device_name == "NPU") {
|
||||
compile_config = {
|
||||
{"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" },
|
||||
{"NPU_USE_NPUW", "YES" },
|
||||
{"NPUW_DEVICES", "NPU" },
|
||||
{"NPUW_FOLD", "YES" },
|
||||
{"NPUW_WEIGHTS_BANK", "shared"},
|
||||
{"NPUW_FUNCALL_FOR_ALL", "YES" },
|
||||
{"NPUW_FUNCALL_ASYNC", "YES" },
|
||||
{"NPUW_DQ", "YES" },
|
||||
{"NPUW_DQ_FULL", "NO" },
|
||||
};
|
||||
if (cache_dir) {
|
||||
compile_config["NPUW_CACHE_DIR"] = cache_dir;
|
||||
}
|
||||
} else if (cache_dir) {
|
||||
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
|
||||
}
|
||||
|
||||
// Initialize remote context with queue sharing for GPU
|
||||
if (device_name == "GPU") {
|
||||
// Create OpenCL context and queue
|
||||
cl_int err;
|
||||
cl_platform_id platform;
|
||||
err = clGetPlatformIDs(1, &platform, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
cl_device_id cl_device;
|
||||
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err);
|
||||
clReleaseContext(cl_ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
// Create OpenVINO remote context with queue sharing
|
||||
remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue);
|
||||
|
||||
// Release the context (queue keeps a reference)
|
||||
clReleaseContext(cl_ctx);
|
||||
} else if (device_name == "NPU") {
|
||||
// remote tensor is not used for NPU yet
|
||||
// remote_context = ov_singleton_core().get_default_context(device_name);
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
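// Example configuration (env var names taken from the getenv calls above; the binary name
// is only illustrative):
//   GGML_OPENVINO_DEVICE=GPU GGML_OPENVINO_CACHE_DIR=/tmp/ov-cache ./llama-cli ...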
|
||||
|
||||
ggml_openvino_device_config::~ggml_openvino_device_config() {
|
||||
if (cl_queue != nullptr) {
|
||||
clReleaseCommandQueue(cl_queue);
|
||||
cl_queue = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Get the global device config singleton
|
||||
ggml_openvino_device_config & ggml_openvino_get_device_config() {
|
||||
static ggml_openvino_device_config config;
|
||||
return config;
|
||||
}
|
||||
|
||||
// Initialize device config (call during backend init)
|
||||
void ggml_openvino_init_device_config() {
|
||||
ggml_openvino_get_device_config().init();
|
||||
}
|
||||
|
||||
// Get the device name
|
||||
const std::string & ggml_openvino_get_device_name() {
|
||||
return ggml_openvino_get_device_config().device_name;
|
||||
}
|
||||
|
||||
// Check if running on NPU
|
||||
bool ggml_openvino_is_npu() {
|
||||
return ggml_openvino_get_device_config().is_npu;
|
||||
}
|
||||
|
||||
// Get the remote context for the current device (returns empty optional for CPU)
|
||||
std::optional<ov::RemoteContext> ggml_openvino_get_remote_context() {
|
||||
return ggml_openvino_get_device_config().remote_context;
|
||||
}
|
||||
|
||||
// Get the compile config for the current device
|
||||
const ov::AnyMap & ggml_openvino_get_compile_config() {
|
||||
return ggml_openvino_get_device_config().compile_config;
|
||||
}
|
||||
|
||||
// Get the OpenCL command queue for GPU operations
|
||||
cl_command_queue ggml_openvino_get_cl_queue() {
|
||||
return ggml_openvino_get_device_config().cl_queue;
|
||||
}
|
||||
|
||||
// Get the clEnqueueMemFillINTEL function pointer (lazy load)
|
||||
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() {
|
||||
static clEnqueueMemFillINTEL_fn fn = nullptr;
|
||||
static bool loaded = false;
|
||||
if (!loaded) {
|
||||
loaded = true;
|
||||
cl_platform_id platform;
|
||||
if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
|
||||
fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
|
||||
}
|
||||
}
|
||||
return fn;
|
||||
}
|
||||
|
||||
// Get the clEnqueueMemcpyINTEL function pointer (lazy load)
|
||||
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
|
||||
static clEnqueueMemcpyINTEL_fn fn = nullptr;
|
||||
static bool loaded = false;
|
||||
if (!loaded) {
|
||||
loaded = true;
|
||||
cl_platform_id platform;
|
||||
if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
|
||||
fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
|
||||
}
|
||||
}
|
||||
return fn;
|
||||
}
|
||||
|
||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
|
||||
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
|
||||
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
|
||||
}
|
||||
if (strncmp(tensor->name, "output.weight", 13) == 0) {
|
||||
return ExtraQuantType::Q8_0_C;
|
||||
}
|
||||
if (ggml_openvino_is_npu()) {
|
||||
return ExtraQuantType::Q4_0_128;
|
||||
}
|
||||
if (no_requant) {
|
||||
return std::nullopt;
|
||||
}
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
return ExtraQuantType::Q8_0_C;
|
||||
default:
|
||||
return std::nullopt;
|
||||
}
|
||||
}
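// Hedged examples of the policy above, following the checks in order:
//   - "token_embd.weight" stored as Q6_K on NPU       -> ExtraQuantType::F16
//   - "token_embd.weight" stored as Q4_K (any device) -> ExtraQuantType::Q8_0_C
//   - "output.weight" (checked before the NPU branch) -> ExtraQuantType::Q8_0_C
//   - any other quantized weight on NPU               -> ExtraQuantType::Q4_0_128
//   - Q6_K / Q5_K elsewhere with no_requant == false  -> ExtraQuantType::Q8_0_C
//   - everything else                                 -> std::nullopt (keep original format)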
|
||||
|
||||
// =====================================================
|
||||
// Extracted Layout Calculation
|
||||
// =====================================================
|
||||
|
||||
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
|
||||
ggml_openvino_extracted_layout layout = {};
|
||||
layout.is_symmetric = false;
|
||||
|
||||
if (!ggml_is_quantized(tensor->type)) {
|
||||
return layout;
|
||||
}
|
||||
|
||||
// Only handle 2D weight tensors
|
||||
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
|
||||
return layout;
|
||||
}
|
||||
|
||||
int64_t n_elements = ggml_nelements(tensor);
|
||||
const size_t alignment = 64; // Good for SIMD
|
||||
|
||||
// Check if requantization is needed (NPU-specific)
|
||||
auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
|
||||
if (requant_type.has_value()) {
|
||||
layout.is_requant = true;
|
||||
layout.requant_type = requant_type;
|
||||
|
||||
// Special case: requant to F16 - just store F16 weights, no scales/zp
|
||||
if (requant_type.value() == ExtraQuantType::F16) {
|
||||
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
|
||||
layout.total_size = layout.weights_size;
|
||||
layout.weights_offset = 0;
|
||||
// No scales/zp for F16
|
||||
return layout;
|
||||
}
|
||||
|
||||
// Requant to different quantized format (e.g., Q4_0_128)
|
||||
switch (requant_type.value()) {
|
||||
case ExtraQuantType::Q4_0_128:
|
||||
layout.is_u4 = true;
|
||||
layout.weights_per_block = 128;
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
case ExtraQuantType::Q4_0_C:
|
||||
layout.is_u4 = true;
|
||||
layout.weights_per_block = tensor->ne[0];
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
case ExtraQuantType::Q8_0_32:
|
||||
layout.is_u4 = false;
|
||||
layout.weights_per_block = 32;
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
case ExtraQuantType::Q8_0_C:
|
||||
layout.is_u4 = false;
|
||||
layout.weights_per_block = tensor->ne[0];
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
case ExtraQuantType::Q8_1_C:
|
||||
layout.is_u4 = false;
|
||||
layout.weights_per_block = tensor->ne[0];
|
||||
break;
|
||||
default:
|
||||
layout.weights_per_block = -1;
|
||||
GGML_ABORT("Re-quantization code for this channel-wise format has not been updated");
|
||||
break;
|
||||
}
|
||||
|
||||
if (layout.is_requant) {
|
||||
// Calculate sizes for requantized format
|
||||
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t);
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
// Zero points are stored in U4 or U8 format matching the weight type
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
|
||||
layout.weights_offset = 0;
|
||||
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
|
||||
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
|
||||
layout.total_size = layout.zp_offset + layout.zp_size;
|
||||
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
|
||||
return layout;
|
||||
}
|
||||
}
|
||||
|
||||
// Normal extraction (no requant) - determine format based on tensor type
|
||||
layout.is_u4 = false;
|
||||
layout.weights_per_block = 32;
|
||||
layout.is_symmetric = false;
|
||||
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
layout.is_u4 = true;
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_K:
|
||||
layout.is_u4 = true;
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q8_0:
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q6_K:
|
||||
layout.weights_per_block = 16;
|
||||
layout.is_symmetric = true;
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q5_K:
|
||||
break;
|
||||
|
||||
default:
|
||||
// Unsupported quantization type
|
||||
return layout;
|
||||
}
|
||||
|
||||
// Calculate sizes
|
||||
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
|
||||
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
|
||||
|
||||
// Scales: F16 per block
|
||||
int64_t n_blocks = n_elements / layout.weights_per_block;
|
||||
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
|
||||
// Zero points: U4 or U8 matching weight type
|
||||
// For symmetric quantization, we only need one zp value (not one per block)
|
||||
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
|
||||
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
|
||||
|
||||
// Layout in buffer: [weights | scales | zp] with alignment
|
||||
layout.weights_offset = 0;
|
||||
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
|
||||
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
|
||||
layout.total_size = layout.zp_offset + layout.zp_size;
|
||||
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
|
||||
|
||||
return layout;
|
||||
}
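// Worked example (assumed shape, normal extraction path): a Q4_0 weight with ne = [4096, 4096]
// has n_elements = 16,777,216. With U4 weights and 32 weights per block:
//   weights_size = 8,388,608 B, n_blocks = 524,288, scales_size = 1,048,576 B,
//   zp_size = 1 B (symmetric quantization uses a single packed U4 zero point).
// With 64-byte alignment: weights_offset = 0, scales_offset = 8,388,608, zp_offset = 9,437,184,
// and total_size = max(9,437,185, ggml_nbytes = 9,437,184) = 9,437,185 bytes.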
|
||||
|
||||
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote) {
|
||||
ov::Shape shape;
|
||||
for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
|
||||
shape.push_back(static_cast<size_t>(tensor->ne[i]));
|
||||
}
|
||||
|
||||
ov::element::Type element_type;
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
element_type = ov::element::f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
element_type = ov::element::f16;
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
element_type = ov::element::bf16;
|
||||
break;
|
||||
case GGML_TYPE_I32:
|
||||
element_type = ov::element::i32;
|
||||
break;
|
||||
case GGML_TYPE_I64:
|
||||
element_type = ov::element::i64;
|
||||
break;
|
||||
default:
|
||||
// GGML_LOG_WARN("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const auto & device_name = ggml_openvino_get_device_name();
|
||||
auto remote_context = ggml_openvino_get_remote_context();
|
||||
|
||||
std::shared_ptr<ov::Tensor> ov_tensor;
|
||||
if (is_remote) {
|
||||
GGML_ASSERT(device_name == "GPU");
|
||||
auto gpu_context = remote_context->as<ov::intel_gpu::ocl::ClContext>();
|
||||
auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data);
|
||||
ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
|
||||
} else {
|
||||
ov_tensor = std::make_shared<ov::Tensor>(element_type, shape, tensor->data);
|
||||
}
|
||||
|
||||
return new ggml_openvino_tensor_extra(ov_tensor);
|
||||
}
@ -0,0 +1,169 @@
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "openvino/runtime/core.hpp"
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 300
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/runtime/remote_context.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
// ExtraQuantType enum - defines requantization target formats
|
||||
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
|
||||
|
||||
ov::Core & ov_singleton_core();
|
||||
|
||||
// Get the remote context for the current device (returns empty optional for CPU)
|
||||
std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
|
||||
|
||||
// Get the compile config for the current device
|
||||
const ov::AnyMap & ggml_openvino_get_compile_config();
|
||||
|
||||
// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU)
|
||||
cl_command_queue ggml_openvino_get_cl_queue();
|
||||
|
||||
// Intel USM extension function type
|
||||
typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue,
|
||||
void * dst_ptr,
|
||||
const void * pattern,
|
||||
size_t pattern_size,
|
||||
size_t size,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue,
|
||||
cl_bool blocking,
|
||||
void * dst_ptr,
|
||||
const void * src_ptr,
|
||||
size_t size,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available)
|
||||
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL();
|
||||
|
||||
// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available)
|
||||
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL();
|
||||
|
||||
// =====================================================
|
||||
// Global Device Configuration (singleton)
|
||||
// =====================================================
|
||||
// Initialized once during backend init from GGML_OPENVINO_DEVICE env var
|
||||
|
||||
struct ggml_openvino_device_config {
|
||||
std::string device_name = "CPU";
|
||||
bool is_npu = false;
|
||||
bool initialized = false;
|
||||
std::optional<ov::RemoteContext> remote_context;
|
||||
ov::AnyMap compile_config;
|
||||
cl_command_queue cl_queue = nullptr;
|
||||
|
||||
void init();
|
||||
~ggml_openvino_device_config();
|
||||
};
|
||||
|
||||
// Get the global device config singleton
|
||||
ggml_openvino_device_config & ggml_openvino_get_device_config();
|
||||
|
||||
// Initialize device config (call during backend init)
|
||||
void ggml_openvino_init_device_config();
|
||||
|
||||
// Get the device name
|
||||
const std::string & ggml_openvino_get_device_name();
|
||||
|
||||
// Check if running on NPU
|
||||
bool ggml_openvino_is_npu();
|
||||
|
||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
|
||||
|
||||
// =====================================================
|
||||
// OpenVINO Tensor Extra Types
|
||||
// =====================================================
|
||||
// These types are stored in tensor->extra by the OpenVINO backend buffer.
|
||||
// They allow:
|
||||
// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction)
|
||||
// 2. ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request)
|
||||
|
||||
// Base class for OpenVINO tensor extra data
|
||||
struct ggml_openvino_extra_base {
|
||||
enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR };
|
||||
Type type;
|
||||
virtual ~ggml_openvino_extra_base() = default;
|
||||
protected:
|
||||
explicit ggml_openvino_extra_base(Type t) : type(t) {}
|
||||
};
|
||||
|
||||
// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
|
||||
struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
|
||||
ov::Tensor weights; // The underlying weight data tensor
|
||||
std::shared_ptr<ov::Node> weight_node; // Pre-built OpenVINO weight node
|
||||
|
||||
ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
|
||||
ggml_openvino_extra_base(Type::WEIGHT),
|
||||
weights(std::move(w)),
|
||||
weight_node(std::move(n)) {}
|
||||
};
|
||||
|
||||
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
|
||||
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
|
||||
ov::Tensor weights; // U4 or U8 extracted weights
|
||||
ov::Tensor scales; // F16 scales
|
||||
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
|
||||
std::shared_ptr<ov::Node> weight_node; // Pre-built OpenVINO weight subgraph
|
||||
|
||||
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
|
||||
ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
|
||||
weights(std::move(w)),
|
||||
scales(std::move(s)),
|
||||
zp(std::move(z)),
|
||||
weight_node(std::move(n)) {}
|
||||
};
|
||||
|
||||
// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
|
||||
struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
|
||||
std::shared_ptr<ov::Tensor> tensor; // For direct use with infer_request
|
||||
|
||||
explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t)
|
||||
: ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {}
|
||||
};
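// Hedged consumer sketch (the real consumers are the backend buffer and GgmlOvDecoder):
//
//   auto * base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
//   switch (base->type) {
//       case ggml_openvino_extra_base::Type::WEIGHT:
//       case ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT:
//           // reuse the pre-built weight_node during graph construction
//           break;
//       case ggml_openvino_extra_base::Type::TENSOR:
//           // hand the wrapped ov::Tensor directly to an ov::InferRequest
//           break;
//   }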
|
||||
|
||||
// =====================================================
|
||||
// Extracted Size Calculation for Quantized Tensors
|
||||
// =====================================================
|
||||
// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
|
||||
// Returns the total size needed in the buffer for extracted data.
|
||||
|
||||
struct ggml_openvino_extracted_layout {
|
||||
size_t total_size = 0; // Total bytes needed
|
||||
size_t weights_offset = 0; // Offset to weights in buffer
|
||||
size_t weights_size = 0; // Size of weights in bytes
|
||||
size_t scales_offset = 0; // Offset to scales in buffer
|
||||
size_t scales_size = 0; // Size of scales in bytes
|
||||
size_t zp_offset = 0; // Offset to zero points in buffer
|
||||
size_t zp_size = 0; // Size of zero points in bytes (U4 or U8)
|
||||
bool is_u4; // true for U4 weights, false for U8
|
||||
int64_t weights_per_block; // weights per scale/zp block
|
||||
bool is_symmetric; // true for symmetric quantization
|
||||
|
||||
// Requantization info
|
||||
bool is_requant = false; // true if this tensor needs requantization
|
||||
std::optional<ExtraQuantType> requant_type; // target requant type if is_requant
|
||||
};
|
||||
|
||||
// Calculate the buffer layout for extracted quantized data
|
||||
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
|
||||
|
||||
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
|
||||
|
||||
// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
|
||||
// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
|
||||
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
File diff suppressed because it is too large
@ -0,0 +1,884 @@
#include "ggml-quants.hpp"
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/core/shape.hpp>
|
||||
#include <openvino/core/type/element_type.hpp>
|
||||
#include <openvino/core/type/element_type_traits.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/op/util/attr_types.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
void unpack_32_4(const uint8_t * data, uint8_t * dst) {
|
||||
std::fill_n(dst, 16, 0);
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uint8_t x = (data[j] & 0x0F);
|
||||
uint8_t y = (data[j] >> 4);
|
||||
if (j % 2 != 0) {
|
||||
x <<= 4;
|
||||
y <<= 4;
|
||||
}
|
||||
dst[j / 2] |= x;
|
||||
dst[8 + j / 2] |= y; // Last 16 weights are in the higher bits
|
||||
}
|
||||
}
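// Example: with data[0] = 0xBA and data[1] = 0xDC, the low nibbles (weights 0 and 1) are
// packed as dst[0] = 0xCA, and the high nibbles (weights 16 and 17) as dst[8] = 0xDB, turning
// the Q4_0 low/high interleave into two contiguous U4 runs of 16 weights each.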
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 4bit weights|.
|
||||
void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr) {
|
||||
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
// For asymmetric quantization, compute per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
// Pack two 4-bit zero points per byte
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8; // Lower nibble
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4); // Upper nibble
|
||||
}
|
||||
}
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, zp) from Q4_1 tensors.
|
||||
// Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|.
|
||||
void extract_q4_1_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias) {
|
||||
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
if (use_bias) {
|
||||
// Store bias (min) directly as f16 instead of computing u4 zero points
|
||||
auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
|
||||
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
|
||||
scales[i] = ov::float16(scale);
|
||||
bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias
|
||||
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
||||
});
|
||||
} else {
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
|
||||
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
|
||||
scales[i] = ov::float16(scale);
|
||||
// zp = -min / scale (bias = min, so zp = -bias/scale)
|
||||
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
|
||||
// Pack two 4-bit zero points per byte
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = zp_val & 0x0F; // Lower nibble
|
||||
} else {
|
||||
zp[i / 2] |= (zp_val << 4); // Upper nibble
|
||||
}
|
||||
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
||||
});
|
||||
}
|
||||
}
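A small numeric sketch (values assumed, not from the diff) of why the zero-point branch above uses zp = -min / scale: ggml dequantizes Q4_1 as q * d + m, and the OpenVINO form (q - zp) * d reproduces it when zp = round(-m / d), exactly when -m / d is an integer and up to the same rounding otherwise.

#include <cmath>
#include <cstdio>

int main() {
    const float d = 0.25f;                // block scale (assumed)
    const float m = -2.0f;                // block min, i.e. the bias (assumed)
    const float zp = std::round(-m / d);  // -(-2) / 0.25 = 8
    for (int q = 0; q < 16; ++q) {
        const float ggml_val = q * d + m;     // ggml Q4_1 dequantization
        const float ov_val   = (q - zp) * d;  // OpenVINO zero-point form
        printf("q=%2d  ggml=%6.2f  ov=%6.2f\n", q, ggml_val, ov_val);
    }
    return 0;
}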
|
||||
|
||||
// Extracts (weight, scales, zp) from Q8_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 8bit weights|.
|
||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr) {
|
||||
const uint64_t weights_per_block = 32;
|
||||
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[i] = 128;
|
||||
}
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
|
||||
// Original data is int8_t; flipping the sign bit maps it to uint8_t with a +128 offset, matching the zero point of 128.
|
||||
x ^= 1 << 7;
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
});
|
||||
}
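A short sketch (not in the diff) of the sign-bit flip used above: XOR-ing bit 7 maps an int8_t value v to the uint8_t value v + 128, so with zero point 128 the backend's (u - 128) * d equals ggml's v * d.

#include <cstdint>
#include <cstdio>

int main() {
    const float d = 0.01f;  // assumed block scale
    for (int v = -128; v <= 127; v += 51) {
        uint8_t u = (uint8_t) v;
        u ^= 1u << 7;  // same trick as extract_q8_0_data
        printf("v=%4d  u=%3u  ggml=%7.3f  ov=%7.3f\n", v, u, v * d, (u - 128) * d);
    }
    return 0;
}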
|
||||
|
||||
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
|
||||
// Initialize the output array with zeros
|
||||
std::fill_n(dst, 128, 0);
|
||||
|
||||
for (size_t i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 32; ++j) {
|
||||
uint8_t x = (data[i * 32 + j] & 0x0F);
|
||||
uint8_t y = (data[i * 32 + j] >> 4);
|
||||
if (j % 2 != 0) {
|
||||
x <<= 4;
|
||||
y <<= 4;
|
||||
}
|
||||
dst[i * 32 + j / 2] |= x;
|
||||
dst[i * 32 + 16 + j / 2] |= y; // Last 16 weights are in the higher bits
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void extract_q4_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias) {
|
||||
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
// For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
|
||||
auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
|
||||
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
// Extract scale factors and offsets
|
||||
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
|
||||
float scale_mins = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
|
||||
|
||||
// Extract qs1 and qs2
|
||||
uint8_t * qs1 = block_data + 4;
|
||||
|
||||
// Calculate scales
|
||||
float scale_vals[8];
|
||||
scale_vals[0] = scale_scales * static_cast<float>((*(qs1) & 0b111111));
|
||||
scale_vals[1] = scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111));
|
||||
scale_vals[2] = scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111));
|
||||
scale_vals[3] = scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111));
|
||||
scale_vals[4] = scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4));
|
||||
scale_vals[5] = scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4));
|
||||
scale_vals[6] = scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4));
|
||||
scale_vals[7] = scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4));
|
||||
|
||||
// Calculate min values (bias = -min)
|
||||
float min_vals[8];
|
||||
min_vals[0] = scale_mins * static_cast<float>((*(qs1 + 4) & 0b111111));
|
||||
min_vals[1] = scale_mins * static_cast<float>((*(qs1 + 5) & 0b111111));
|
||||
min_vals[2] = scale_mins * static_cast<float>((*(qs1 + 6) & 0b111111));
|
||||
min_vals[3] = scale_mins * static_cast<float>((*(qs1 + 7) & 0b111111));
|
||||
min_vals[4] = scale_mins * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4));
|
||||
min_vals[5] = scale_mins * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4));
|
||||
min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
|
||||
min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
|
||||
|
||||
// Store scales and compute zero points or bias
|
||||
for (int j = 0; j < 8; j++) {
|
||||
scales[i * 8 + j] = ov::float16(scale_vals[j]);
|
||||
if (use_bias) {
|
||||
// Store bias = -min directly as f16, dequant: w*s + bias
|
||||
bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
|
||||
} else {
|
||||
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
||||
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
|
||||
// Pack two 4-bit zero points per byte
|
||||
size_t idx = i * 8 + j;
|
||||
if (idx % 2 == 0) {
|
||||
zp_u4[idx / 2] = zp_val & 0x0F;
|
||||
} else {
|
||||
zp_u4[idx / 2] |= (zp_val << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
unpack_256_4(block_data + 16, weights + i * 128);
|
||||
});
|
||||
}
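For orientation, a tiny informational sketch (constants taken from the code above, not a new definition) of the Q4_K super-block layout: two f16 super-scales, 12 bytes of packed 6-bit scales and mins, and 128 bytes holding 256 4-bit weights in 8 groups of 32.

#include <cstdio>

int main() {
    const int d_bytes = 2, dmin_bytes = 2, scales_bytes = 12, qs_bytes = 128;
    const int super_block_bytes = d_bytes + dmin_bytes + scales_bytes + qs_bytes;  // 144
    const int weights = 256, groups = 8;
    printf("bytes per super-block: %d\n", super_block_bytes);
    printf("bits per weight: %.3f\n", 8.0 * super_block_bytes / weights);  // ~4.5
    printf("weights per group: %d\n", weights / groups);                   // 32
    return 0;
}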
|
||||
|
||||
void extract_q6_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr) {
|
||||
const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
// For Q6_K, zero point is always 32
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 32;
|
||||
}
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
float scale_factor =
|
||||
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
|
||||
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[j + i * 16] = 32;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
|
||||
weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
|
||||
weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
|
||||
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
|
||||
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
|
||||
}
|
||||
});
|
||||
}
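An informational sketch (constants from the code above) of the Q6_K super-block layout: 210 bytes for 256 six-bit weights, split into 128 bytes of low nibbles (ql), 64 bytes of 2-bit high bits (qh), 16 int8 sub-block scales and one f16 super-scale.

#include <cstdio>

int main() {
    const int ql = 128, qh = 64, sub_scales = 16, d = 2;
    const int bytes = ql + qh + sub_scales + d;            // 210
    printf("bytes per super-block: %d\n", bytes);
    printf("bits per weight: %.4f\n", 8.0 * bytes / 256);  // 6.5625
    // Each weight is reassembled as (ql nibble) | (2 qh bits << 4), giving 0..63;
    // the fixed zero point 32 then recentres it to the signed range [-32, 31].
    return 0;
}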
|
||||
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & 63;
|
||||
*m = q[j + 4] & 63;
|
||||
} else {
|
||||
*d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
|
||||
*m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
|
||||
void extract_q5_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias) {
|
||||
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
// For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
|
||||
auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
|
||||
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
|
||||
const float min_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
|
||||
|
||||
const uint8_t * scales_data = block_data + 4; // 12 bytes of scales
|
||||
const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits
|
||||
const uint8_t * ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
|
||||
|
||||
int is = 0;
|
||||
uint8_t u1 = 1;
|
||||
uint8_t u2 = 2;
|
||||
|
||||
// Process 2 blocks in one iteration
|
||||
for (int j = 0; j < 256; j += 64) { // 256 = QK_K, so 4 iterations of 64
|
||||
uint8_t sc;
|
||||
uint8_t m;
|
||||
|
||||
// Get scale and min for first 32 elements
|
||||
get_scale_min_k4(is + 0, scales_data, &sc, &m);
|
||||
const float d1 = d * sc;
|
||||
const float m1 = min_factor * m;
|
||||
|
||||
// Get scale and min for second 32 elements
|
||||
get_scale_min_k4(is + 1, scales_data, &sc, &m);
|
||||
const float d2 = d * sc;
|
||||
const float m2 = min_factor * m;
|
||||
|
||||
scales[i * 8 + is] = ov::float16(d1);
|
||||
scales[i * 8 + is + 1] = ov::float16(d2);
|
||||
if (use_bias) {
|
||||
// Store bias = -min directly as f16, dequant: w*s + bias
|
||||
bias_f16[i * 8 + is] = ov::float16(-m1);
|
||||
bias_f16[i * 8 + is + 1] = ov::float16(-m2);
|
||||
} else {
|
||||
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
||||
zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
|
||||
zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
|
||||
}
|
||||
|
||||
// Extract weights for first 32 elements (matching deq formula exactly)
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0);
|
||||
}
|
||||
|
||||
// Extract weights for second 32 elements
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 16 : 0);
|
||||
}
|
||||
|
||||
ql += 32;
|
||||
is += 2;
|
||||
u1 <<= 2;
|
||||
u2 <<= 2;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO Reorder for make_intX_weights
|
||||
|
||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_shape = weight.get_shape();
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
auto scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
|
||||
|
||||
if (packed_shape[1] == 1) {
|
||||
// Requantized channel-wise case
|
||||
packed_shape.erase(packed_shape.begin() + 1);
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
// Create graph nodes
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
// If not requantized channel-wise case, reshape back to original shape
|
||||
auto final_shape =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
|
||||
result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
|
||||
}
|
||||
|
||||
return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
|
||||
}
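A hedged usage sketch (shapes, the manual zero-point fill and the helper name are illustrative, not from the diff) of how a caller could feed extracted Q8_0 buffers into make_int8_weights; the returned node is the f32 decompression subgraph built from (w - zp) * s.

#include <cstdint>
#include <openvino/runtime/tensor.hpp>

#include "ggml-quants.hpp"

static ov::Output<ov::Node> build_q8_0_weight_example(size_t rows, size_t cols) {
    ov::Tensor weights(ov::element::u8, {rows, cols});       // filled by extract_q8_0_data
    ov::Tensor scales(ov::element::f16, {rows, cols / 32});  // one f16 scale per 32-weight block
    ov::Tensor zp(ov::element::u8, ov::Shape{});             // scalar zp => symmetric quantization
    *static_cast<uint8_t *>(zp.data()) = 128;                // Q8_0 zero point
    // Builds Convert(f32, (Convert(f16, w) - 128) * scales), reshaped back to [rows, cols].
    return make_int8_weights(weights, scales, zp, /*group_size=*/32);
}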
|
||||
|
||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
size_t group_size,
|
||||
bool use_bias) {
|
||||
ov::Shape orig_weight_shape = weight.get_shape();
|
||||
|
||||
// Expand dimensions for scales and zp/bias
|
||||
ov::Shape scale_shape = scales.get_shape();
|
||||
auto zp_shape = zp.get_shape();
|
||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||
|
||||
// Create INT4 weight tensor
|
||||
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
|
||||
|
||||
if (packed_shape[1] == 1) {
|
||||
// Requantized channel-wise case
|
||||
packed_shape.erase(packed_shape.begin() + 1);
|
||||
} else {
|
||||
scale_shape.push_back(1);
|
||||
scales.set_shape(scale_shape);
|
||||
// For symmetric quantization, zp remains scalar (don't resize)
|
||||
if (!is_scalar_zp) {
|
||||
zp_shape.push_back(1);
|
||||
zp.set_shape(zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
|
||||
ov::Output<ov::Node> result;
|
||||
if (use_bias && !is_scalar_zp) {
|
||||
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
} else {
|
||||
// Zero point path: (w - zp) * s
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
auto w_zp =
|
||||
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
// If not requantized channel-wise case, reshape back to original shape
|
||||
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
|
||||
orig_weight_shape);
|
||||
result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
|
||||
}
|
||||
|
||||
return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
|
||||
}
|
||||
|
||||
// Extract quantized weights from tensor and create weight subgraph
|
||||
std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
|
||||
const void * data,
|
||||
ov::Tensor & weights,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
bool use_bias) {
|
||||
// Create a temporary tensor for extraction functions that read from tensor->data
|
||||
ggml_tensor temp_tensor = *tensor;
|
||||
temp_tensor.data = const_cast<void *>(data);
|
||||
|
||||
// Determine block size based on tensor type
|
||||
int64_t weights_per_block;
|
||||
bool is_u4;
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_K:
|
||||
is_u4 = true;
|
||||
weights_per_block = 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q5_K:
|
||||
is_u4 = false;
|
||||
weights_per_block = 32;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
is_u4 = false;
|
||||
weights_per_block = 16;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unsupported quantized type for extraction: " +
|
||||
std::string(ggml_type_name(tensor->type)));
|
||||
}
|
||||
|
||||
// Extract quantized data
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
extract_q4_0_data(&temp_tensor, weights, scales, zp);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
extract_q8_0_data(&temp_tensor, weights, scales, zp);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
extract_q6_k_data(&temp_tensor, weights, scales, zp);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
|
||||
}
|
||||
|
||||
// Create the OpenVINO weight subgraph
|
||||
ov::Output<ov::Node> weight_node;
|
||||
if (is_u4) {
|
||||
weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
|
||||
} else {
|
||||
weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
|
||||
}
|
||||
|
||||
auto result = weight_node.get_node_shared_ptr();
|
||||
result->set_friendly_name(tensor->name);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Requantize weights to target format, writing to provided buffers
|
||||
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
|
||||
const void * data,
|
||||
ExtraQuantType requant_type,
|
||||
int64_t block_size,
|
||||
ov::Tensor & weights,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp) {
|
||||
int64_t n_elements = ggml_nelements(tensor);
|
||||
|
||||
// First dequantize to F32
|
||||
std::vector<float> weights_f32(n_elements);
|
||||
ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements);
|
||||
|
||||
// Handle F16 case - just convert and create constant
|
||||
if (requant_type == ExtraQuantType::F16) {
|
||||
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements);
|
||||
auto result = std::make_shared<ov::op::v0::Constant>(weights);
|
||||
result->set_friendly_name(tensor->name);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Requantize to target quantized format
|
||||
bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
|
||||
|
||||
if (is_u4) {
|
||||
quantize_q4_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
|
||||
} else if (requant_type == ExtraQuantType::Q8_1_C) {
|
||||
quantize_q8_1(weights_f32.data(), weights, scales, zp, n_elements, block_size);
|
||||
} else {
|
||||
quantize_q8_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
|
||||
}
|
||||
|
||||
// Create the OpenVINO weight subgraph
|
||||
ov::Output<ov::Node> weight_node;
|
||||
if (is_u4) {
|
||||
weight_node = make_int4_weights(weights, scales, zp, block_size);
|
||||
} else {
|
||||
weight_node = make_int8_weights(weights, scales, zp, block_size);
|
||||
}
|
||||
|
||||
auto result = weight_node.get_node_shared_ptr();
|
||||
result->set_friendly_name(tensor->name);
|
||||
return result;
|
||||
}
|
||||
|
||||
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
|
||||
GGML_ASSERT(tensor != nullptr);
|
||||
GGML_ASSERT(data != nullptr);
|
||||
|
||||
OvWeight result;
|
||||
|
||||
// Get 2D shape for weights [rows, cols]
|
||||
ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
|
||||
|
||||
// Handle F16/F32/BF16 weights
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
||||
ov::element::Type element_type;
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
element_type = ov::element::f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
element_type = ov::element::f16;
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
element_type = ov::element::bf16;
|
||||
break;
|
||||
default:
|
||||
OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
|
||||
}
|
||||
|
||||
if (output_base_ptr && output_base_ptr != data) {
|
||||
// Using external buffer - copy data and create shared-memory constant
|
||||
size_t tensor_bytes = ggml_nbytes(tensor);
|
||||
memcpy(output_base_ptr, data, tensor_bytes);
|
||||
result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
|
||||
} else {
|
||||
result.weights = ov::Tensor(element_type, node_shape, data);
|
||||
}
|
||||
result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Handle quantized weights
|
||||
if (!ggml_is_quantized(tensor->type)) {
|
||||
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
|
||||
}
|
||||
|
||||
result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
|
||||
const auto & layout = result.layout;
|
||||
if (layout.total_size == 0) {
|
||||
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
|
||||
}
|
||||
|
||||
if (use_bias) {
|
||||
OPENVINO_ASSERT(!layout.is_requant,
|
||||
"use_bias is only used for test-backend-ops, which should not have requantization");
|
||||
// bias node will be created on the fly and not use backend buffer
|
||||
output_base_ptr = nullptr;
|
||||
}
|
||||
|
||||
// F16 requant path - no separate scales/zp needed in result
|
||||
if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
|
||||
if (output_base_ptr) {
|
||||
result.weights = ov::Tensor(ov::element::f16, node_shape,
|
||||
static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
|
||||
} else {
|
||||
result.weights = ov::Tensor(ov::element::f16, node_shape);
|
||||
}
|
||||
ov::Tensor dummy_scales, dummy_zp; // Not used for F16
|
||||
result.weight_node =
|
||||
requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Quantized path (normal extraction or quantized requant)
|
||||
// Create weight/scale/zp tensors - shared between both paths
|
||||
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
|
||||
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
|
||||
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
|
||||
|
||||
if (output_base_ptr) {
|
||||
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
|
||||
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
|
||||
result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
|
||||
} else {
|
||||
result.weights = ov::Tensor(weight_type, node_shape);
|
||||
result.scales = ov::Tensor(ov::element::f16, scale_shape);
|
||||
if (use_bias && !layout.is_symmetric) {
|
||||
// bias only has effect for asymmetric quant
|
||||
result.zp = ov::Tensor(ov::element::f16, zp_shape);
|
||||
} else {
|
||||
result.zp = ov::Tensor(weight_type, zp_shape);
|
||||
}
|
||||
}
|
||||
|
||||
if (layout.is_requant && layout.requant_type.has_value()) {
|
||||
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
|
||||
result.weights, result.scales, result.zp);
|
||||
} else {
|
||||
result.weight_node =
|
||||
extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
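A hedged sketch (assumed call pattern, not from the diff) of the internal-allocation path of process_weight_tensor: with output_base_ptr left as nullptr the extracted tensors are allocated internally and only the resulting weight node is consumed.

#include <memory>

#include "ggml-quants.hpp"
#include "ggml.h"

static std::shared_ptr<ov::Node> convert_weight_example(const ggml_tensor * tensor) {
    // Internal allocation, no f16 bias: the defaults used on the decoder path.
    OvWeight ow = process_weight_tensor(tensor, tensor->data);
    // For quantized inputs ow.scales / ow.zp are populated and ow.weight_node is the
    // f32 decompression subgraph; for F16/F32/BF16 it is a plain Constant.
    return ow.weight_node;
}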
|
||||
|
||||
void quantize_q4_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
// For Q4_0, zero point is always 8
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -8;
|
||||
|
||||
if (d == 0) {
|
||||
scales[i] = ov::float16(1.0f);
|
||||
// zp is already set to 8 for symmetric, or set per-block for asymmetric
|
||||
if (!is_scalar_zp) {
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
}
|
||||
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
|
||||
continue;
|
||||
}
|
||||
|
||||
const float id = 1.0f / d;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
if (i % 2 == 0) {
|
||||
zp[i / 2] = 8;
|
||||
} else {
|
||||
zp[i / 2] |= (8 << 4);
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk / 2; ++j) {
|
||||
const float x0 = x[i * qk + 2 * j] * id;
|
||||
const float x1 = x[i * qk + 2 * j + 1] * id;
|
||||
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
|
||||
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
|
||||
}
|
||||
}
|
||||
}
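A toy round-trip sketch (data and shapes assumed) for the requantization above: quantize one 32-element block with quantize_q4_0, then dequantize each nibble as (q - 8) * d and check the error stays around half a quantization step.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

#include <openvino/runtime/tensor.hpp>

#include "ggml-quants.hpp"

int main() {
    const int64_t qk = 32;
    float x[32];
    for (int j = 0; j < 32; ++j) {
        x[j] = 0.1f * (j - 16);  // arbitrary test block
    }
    ov::Tensor w(ov::element::u4, ov::Shape{32});
    ov::Tensor s(ov::element::f16, ov::Shape{1});
    ov::Tensor z(ov::element::u4, ov::Shape{});  // scalar => symmetric, zero point fixed at 8
    quantize_q4_0(x, w, s, z, /*k=*/qk, /*qk=*/qk);

    const float d = s.data<ov::element_type_traits<ov::element::f16>::value_type>()[0];
    const uint8_t * packed = static_cast<const uint8_t *>(w.data());
    float max_err = 0.0f;
    for (int j = 0; j < 32; ++j) {
        const uint8_t q = (j % 2 == 0) ? (packed[j / 2] & 0x0F) : (packed[j / 2] >> 4);
        max_err = std::max(max_err, std::fabs((q - 8) * d - x[j]));
    }
    printf("scale=%f  max abs error=%f (about d/2)\n", d, max_err);
    return 0;
}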
|
||||
|
||||
void quantize_q8_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
|
||||
|
||||
// For Q8_0, zero point is always 128
|
||||
if (is_scalar_zp) {
|
||||
zp[0] = 128;
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
}
|
||||
}
|
||||
|
||||
const float d = amax / 127.0f;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
// For asymmetric quantization, store per-block zero points
|
||||
if (!is_scalar_zp) {
|
||||
zp[i] = 128;
|
||||
}
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = x[i * qk + j] * id;
|
||||
const int8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = (uint8_t) (xi0 + 128);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q8_1(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = std::numeric_limits<float>::max();
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i * qk + j];
|
||||
if (v < min) {
|
||||
min = v;
|
||||
}
|
||||
if (v > max) {
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 8) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
scales[i] = ov::float16(d);
|
||||
// zp = -min / scale (Q8_1 is asymmetric)
|
||||
zp[i] = (d != 0.0f) ? (uint8_t) std::round(-min / d) : 0;
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
const float x0 = (x[i * qk + j] - min) * id;
|
||||
const uint8_t xi0 = roundf(x0);
|
||||
weights[i * qk + j] = xi0;
|
||||
}
|
||||
}
|
||||
}
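A numeric sketch (block values assumed) of the asymmetric mapping above: d = (max - min) / 255 and zp = round(-min / d), so the stored u8 weight q = round((x - min) / d) reconstructs x as (q - zp) * d.

#include <cmath>
#include <cstdio>

int main() {
    const float min = -1.0f, max = 3.0f;        // assumed block range
    const float d = (max - min) / 255.0f;
    const float zp = std::round(-min / d);      // ~64
    for (float x : {min, 0.0f, max}) {
        const float q = std::round((x - min) / d);  // stored u8 weight
        printf("x=%6.3f  q=%5.1f  reconstructed=%6.3f\n", x, q, (q - zp) * d);
    }
    return 0;
}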
@ -0,0 +1,153 @@
#pragma once
|
||||
#include "ggml-openvino-extra.h" // For ExtraQuantType
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
|
||||
void unpack_32_4(const uint8_t* data, uint8_t* dst);
|
||||
|
||||
void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr);
|
||||
|
||||
void extract_q4_1_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias = false);
|
||||
|
||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr);
|
||||
|
||||
void unpack_256_4(const uint8_t* data, uint8_t* dst);
|
||||
|
||||
void extract_q4_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias = false);
|
||||
|
||||
void extract_q5_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
bool use_bias = false);
|
||||
|
||||
void extract_q6_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr);
|
||||
|
||||
static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
|
||||
|
||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
||||
bool use_bias = false);
|
||||
|
||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
||||
bool use_bias = false);
|
||||
|
||||
// Extract quantized weights from tensor and create weight subgraph
|
||||
// If weights/scales/zp are provided (non-empty), uses them as output buffers
|
||||
// Otherwise allocates new ov::Tensors internally
|
||||
// Returns the weight node (make_int4_weights or make_int8_weights result)
|
||||
std::shared_ptr<ov::Node> extract_quantized_weights(
|
||||
const ggml_tensor * tensor,
|
||||
const void * data, // Source data pointer (may differ from tensor->data)
|
||||
ov::Tensor & weights,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp,
|
||||
bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops)
|
||||
|
||||
// Requantize weights from tensor to target format, writing to provided buffers
|
||||
// For F16 target, only weights buffer is used (scales/zp ignored)
|
||||
// Returns the weight node
|
||||
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
|
||||
const void * data, // Source data pointer
|
||||
ExtraQuantType requant_type,
|
||||
int64_t block_size,
|
||||
ov::Tensor & weights,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & zp);
|
||||
|
||||
inline const char * extra_quant_type_name(ExtraQuantType t) {
|
||||
switch (t) {
|
||||
case ExtraQuantType::F16:
|
||||
return "F16";
|
||||
case ExtraQuantType::Q4_0_C:
|
||||
return "Q4_0_C";
|
||||
case ExtraQuantType::Q4_0_128:
|
||||
return "Q4_0_128";
|
||||
case ExtraQuantType::Q8_0_C:
|
||||
return "Q8_0_C";
|
||||
case ExtraQuantType::Q8_0_32:
|
||||
return "Q8_0_32";
|
||||
case ExtraQuantType::Q8_1_C:
|
||||
return "Q8_1_C";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
// Result from process_weight_tensor containing the weight node and tensors.
|
||||
// For quantized weights, also contains the extracted layout and scale/zp tensors.
|
||||
struct OvWeight {
|
||||
std::shared_ptr<ov::Node> weight_node;
|
||||
ggml_openvino_extracted_layout layout; // Only meaningful for quantized (layout.total_size > 0)
|
||||
ov::Tensor weights;
|
||||
ov::Tensor scales;
|
||||
ov::Tensor zp;
|
||||
|
||||
bool is_quantized() const { return layout.scales_size > 0; }
|
||||
};
|
||||
|
||||
// Process weight tensor and create an OpenVINO weight node
|
||||
// Handles F16/F32/BF16 and quantized weights, with optional requantization
|
||||
// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
|
||||
// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
|
||||
// Returns OvWeight with the weight node and optional quantized tensors
|
||||
OvWeight process_weight_tensor(
|
||||
const ggml_tensor * tensor,
|
||||
const void * data, // Source data pointer (may differ from tensor->data)
|
||||
void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation)
|
||||
bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops
|
||||
|
||||
void quantize_q4_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk);
|
||||
void quantize_q8_1(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk);
|
||||
void quantize_q8_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & zp_arr,
|
||||
int64_t k,
|
||||
int64_t qk);
|
||||
|
||||
namespace ov {
|
||||
namespace op {
|
||||
namespace util {
|
||||
// From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
|
||||
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
|
||||
float& value,
|
||||
bool check_value_range = true);
|
||||
} // namespace util
|
||||
} // namespace op
|
||||
} // namespace ov
@ -0,0 +1,74 @@
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/frontend/decoder.hpp>
|
||||
#include <string>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
class GgmlDecoder : public DecoderBase {
|
||||
public:
|
||||
virtual ov::Any get_attribute(const std::string& name) const = 0;
|
||||
|
||||
virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0;
|
||||
|
||||
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string& name) const = 0;
|
||||
|
||||
virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0;
|
||||
|
||||
virtual size_t get_input_size() const = 0;
|
||||
|
||||
virtual size_t get_input_size(int node_idx) const = 0;
|
||||
|
||||
virtual void get_input_node(size_t input_port_idx,
|
||||
std::string& producer_name,
|
||||
std::string& producer_output_port_name,
|
||||
size_t& producer_output_port_index) const = 0;
|
||||
|
||||
virtual std::vector<std::string> get_input_names(int node_idx) const = 0;
|
||||
|
||||
virtual PartialShape get_output_shape(int node_idx) const = 0;
|
||||
|
||||
virtual element::Type get_output_type(const int node_idx) const = 0;
|
||||
|
||||
virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;
|
||||
|
||||
virtual int32_t * get_output_op_params(int node_idx) const = 0;
|
||||
|
||||
virtual std::vector<std::string> get_output_names(int node_idx) const = 0;
|
||||
|
||||
virtual const std::string& get_op_type() const = 0;
|
||||
|
||||
virtual const std::string& get_op_type(int node_idx) const = 0;
|
||||
|
||||
virtual const std::string& get_op_name() const = 0;
|
||||
|
||||
virtual const std::string& get_op_name(int node_idx) const = 0;
|
||||
|
||||
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const = 0;
|
||||
|
||||
virtual int get_op_case(int node_idx) const = 0;
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
|
||||
virtual std::vector<std::string> get_model_output_names() const = 0;
|
||||
|
||||
virtual int32_t* get_rope_params() const = 0;
|
||||
|
||||
virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
|
||||
|
||||
virtual bool is_static() const = 0;
|
||||
|
||||
virtual bool is_stateful() const = 0;
|
||||
|
||||
virtual int is_swa_layer(int layer) const = 0;
|
||||
};
|
||||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,27 @@
#include "frontend.hpp"
|
||||
|
||||
#include "input_model.hpp"
|
||||
#include "op_table.hpp"
|
||||
#include "translate_session.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
FrontEnd::FrontEnd() {}
|
||||
|
||||
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr & model, bool naive) {
|
||||
auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
|
||||
FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");
|
||||
std::shared_ptr<Model> converted_model;
|
||||
const auto & supported_ops = get_supported_ops();
|
||||
{
|
||||
TranslateSession translate_session(model, supported_ops, naive);
|
||||
converted_model = translate_session.get_converted_model();
|
||||
}
|
||||
return converted_model;
|
||||
}
|
||||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,23 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <openvino/frontend/frontend.hpp>

namespace ov {
namespace frontend {
namespace ggml {

class FrontEnd {
public:
    using Ptr = std::shared_ptr<FrontEnd>;
    FrontEnd();

    static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
};

} // namespace ggml
} // namespace frontend
} // namespace ov
@ -0,0 +1,17 @@
#include "input_model.hpp"
|
||||
|
||||
#include "decoder.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
InputModel::InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder) : m_decoder(gdecoder) {}
|
||||
|
||||
const std::shared_ptr<GgmlDecoder> & InputModel::get_model_decoder() const {
|
||||
return m_decoder;
|
||||
}
|
||||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,29 @@
#pragma once

#include <openvino/frontend/input_model.hpp>

#include "decoder.hpp"

namespace ov {
namespace frontend {
namespace ggml {

class FrontEnd;
class GgmlDecoder;
using ov::frontend::ggml::GgmlDecoder;

class InputModel : public ov::frontend::InputModel {
    friend class ::ov::frontend::ggml::FrontEnd;

public:
    explicit InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder);

    const std::shared_ptr<GgmlDecoder>& get_model_decoder() const;

private:
    std::shared_ptr<GgmlDecoder> m_decoder;
};

} // namespace ggml
} // namespace frontend
} // namespace ov
@ -0,0 +1,112 @@
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <openvino/frontend/node_context.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "decoder.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
class TranslateSession;
|
||||
|
||||
typedef std::map<std::string, Output<Node>> TensorMap;
|
||||
|
||||
class NodeContext : public frontend::NodeContext {
|
||||
public:
|
||||
NodeContext(const std::shared_ptr<GgmlDecoder>& decoder,
|
||||
std::shared_ptr<TensorMap>& tensor_map,
|
||||
int node_idx,
|
||||
TranslateSession* translate_session = nullptr)
|
||||
: ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
|
||||
m_decoder(decoder),
|
||||
m_tensor_map(tensor_map),
|
||||
m_node_idx(node_idx),
|
||||
m_translate_session(translate_session) {
|
||||
m_input_names = decoder->get_input_names(m_node_idx);
|
||||
m_output_names = decoder->get_output_names(m_node_idx);
|
||||
}
|
||||
|
||||
TranslateSession* get_translate_session() const {
|
||||
return m_translate_session;
|
||||
}
|
||||
|
||||
const std::vector<std::string>& get_input_names() const { return m_input_names; }
|
||||
|
||||
size_t get_input_size() const override {
|
||||
return m_decoder->get_input_size(m_node_idx);
|
||||
}
|
||||
|
||||
ov::element::Type get_input_type(size_t index) const {
|
||||
return m_decoder->get_input_type(m_node_idx, m_input_names[index]);
|
||||
}
|
||||
|
||||
PartialShape get_input_shape(size_t input_index) const {
|
||||
return m_decoder->get_input_shape(m_node_idx, m_input_names[input_index]);
|
||||
}
|
||||
|
||||
std::vector<size_t> get_input_stride(size_t index) const {
|
||||
return m_decoder->get_input_stride(m_node_idx, m_input_names[index]);
|
||||
}
|
||||
|
||||
std::string get_output_name() const { return m_output_names[0]; }
|
||||
|
||||
PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); }
|
||||
|
||||
int32_t* get_input_op_params(size_t index) const {
|
||||
return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
|
||||
}
|
||||
|
||||
int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
|
||||
|
||||
ov::element::Type get_output_type() const {
|
||||
return m_decoder->get_output_type(m_node_idx);
|
||||
}
|
||||
|
||||
Output<Node> get_input(int idx) const override {
|
||||
return m_tensor_map->at(m_input_names[idx]);
|
||||
}
|
||||
|
||||
Output<Node> get_input(const std::string& name) const override {
|
||||
if (m_tensor_map->find(name) == m_tensor_map->end()) {
|
||||
throw std::runtime_error("'" + name + "' not found in tensor map.");
|
||||
}
|
||||
return m_tensor_map->at(name);
|
||||
}
|
||||
|
||||
bool has_input(const std::string& name) const {
|
||||
return m_tensor_map->find(name) != m_tensor_map->end();
|
||||
}
|
||||
|
||||
const std::string& get_name() const override {
|
||||
return m_decoder->get_op_name(m_node_idx);
|
||||
}
|
||||
|
||||
ov::Any get_attribute_as_any(const std::string& name) const override {
|
||||
return m_decoder->get_attribute(name);
|
||||
}
|
||||
|
||||
int get_op_case() const {
|
||||
return m_decoder->get_op_case(m_node_idx);
|
||||
}
|
||||
|
||||
bool is_static() const { return m_decoder->is_static(); }
|
||||
|
||||
bool is_stateful() const { return m_decoder->is_stateful(); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<GgmlDecoder> m_decoder;
|
||||
std::shared_ptr<TensorMap>& m_tensor_map;
|
||||
int m_node_idx;
|
||||
TranslateSession* m_translate_session;
|
||||
std::vector<std::string> m_input_names;
|
||||
std::vector<std::string> m_output_names;
|
||||
};
|
||||
|
||||
using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext&)>;
|
||||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,48 @@
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_cont(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
|
||||
|
||||
auto src_shape = context.get_input_shape(0).to_shape();
|
||||
auto dst_shape = context.get_output_shape().to_shape();
|
||||
ov::Output<Node> res;
|
||||
|
||||
if (op_case == 1) {
|
||||
// The input comes from a PERMUTE
|
||||
throw std::runtime_error("Code of this case might be outdated");
|
||||
dst_shape[1] = -1;
|
||||
res = std::make_shared<ov::op::v1::Reshape>(
|
||||
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
|
||||
} else if (op_case == 2) {
|
||||
// The input comes from a TRANSPOSE
|
||||
return {context.get_input(0)};
|
||||
} else {
|
||||
// The input comes from a VIEW
|
||||
res = process_view_input(context, 0);
|
||||
}
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,21 @@
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <openvino/op/convert.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_cpy(const NodeContext & context) {
|
||||
auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type());
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,90 @@
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/scaled_dot_product_attention.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <string>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_flash_attn_ext(const NodeContext & context) {
|
||||
num_inputs_check(context, 4, 4);
|
||||
auto q_f32 = context.get_input(0);
|
||||
auto k = context.get_input(1);
|
||||
auto v = context.get_input(2);
|
||||
auto mask = context.get_input(3);
|
||||
|
||||
float * params = reinterpret_cast<float *>(context.get_output_op_params());
|
||||
float scale = params[0];
|
||||
// float max_bias = params[1];
|
||||
// float logit_softcap = params[2];
|
||||
|
||||
auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
|
||||
auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});
|
||||
|
||||
ov::Output<ov::Node> mask_sliced, res;
|
||||
std::string mask_name = "KQ_mask_sliced";
|
||||
if (context.get_input_names()[3].find("swa") != std::string::npos) {
|
||||
mask_name = "KQ_mask_swa_sliced";
|
||||
}
|
||||
if (context.has_input(mask_name)) {
|
||||
mask_sliced = context.get_input(mask_name);
|
||||
} else {
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
|
||||
auto token_len = get_dimensions(q, {2});
|
||||
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
|
||||
}
|
||||
|
||||
if (mask_sliced.get_element_type() != ov::element::f16) {
|
||||
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
|
||||
}
|
||||
|
||||
auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
|
||||
int64_t factor = num_heads / num_heads_kv;
|
||||
if (factor > 1 && num_heads_kv > 1) {
|
||||
ov::Output<ov::Node> kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
|
||||
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
|
||||
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
|
||||
|
||||
kv_broadcast_shape = ov::op::v0::Constant::create(
|
||||
ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
|
||||
new_kv_shape =
|
||||
ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size});
|
||||
|
||||
kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape,
|
||||
ov::op::BroadcastType::BIDIRECTIONAL);
|
||||
kv = std::make_shared<ov::op::v1::Reshape>(kv, new_kv_shape, true);
|
||||
}
|
||||
return kv;
|
||||
};
|
||||
|
||||
auto q_shape = context.get_input_shape(0).to_shape();
|
||||
auto k_shape = context.get_input_shape(1).to_shape();
|
||||
k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
|
||||
v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);
|
||||
|
||||
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
|
||||
res = std::make_shared<ov::op::v1::Transpose>(sdpa,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
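An informational sketch (head counts assumed) of the tile_kv step above: with grouped-query attention the K/V tensors are repeated along the head axis so ScaledDotProductAttention sees as many K/V heads as query heads, while head_size and the sequence dimension are untouched.

#include <cstdio>

int main() {
    const long num_heads = 32, num_heads_kv = 8, head_size = 128;
    const long factor = num_heads / num_heads_kv;  // 4
    // [B, 8, T, 128] -> unsqueeze axis 2 -> [B, 8, 1, T, 128]
    // -> broadcast by [1, 1, factor, 1, 1] -> [B, 8, 4, T, 128]
    // -> reshape to [0, num_heads, -1, head_size] = [B, 32, T, 128]
    printf("factor=%ld  tiled heads=%ld  head_size=%ld\n", factor, num_heads_kv * factor, head_size);
    return 0;
}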
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
@ -0,0 +1,69 @@
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_get_rows(const NodeContext & context) {
|
||||
num_inputs_check(context, 2, 2);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
||||
Output<Node> res;
|
||||
auto data = context.get_input(0);
|
||||
auto indices = context.get_input(1);
|
||||
|
||||
if (op_case == 2) {
|
||||
// The input comes from a VIEW
|
||||
indices = process_view_input(context, 1);
|
||||
}
|
||||
|
||||
// data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case
|
||||
// data[x,y] ind[1,1,1,x'] normal case
|
||||
indices =
|
||||
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
||||
if (data.get_partial_shape().rank() == 4) {
|
||||
if (data.get_partial_shape()[1].get_length() == 1) {
|
||||
// Work-around for a bug in ov cpu plugin for test-backend-ops
|
||||
data = std::make_shared<ov::op::v0::Squeeze>(data,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
||||
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
|
||||
} else {
|
||||
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
||||
data =
|
||||
std::make_shared<ov::op::v0::Squeeze>(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
|
||||
}
|
||||
} else if (context.is_stateful() && data.get_partial_shape().rank() == 3) {
|
||||
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
||||
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
|
||||
} else {
|
||||
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
|
||||
}
|
||||
|
||||
if (res.get_element_type() != context.get_output_type()) {
|
||||
res = std::make_shared<ov::op::v0::Convert>(res, context.get_output_type());
|
||||
}
|
||||
if (!(context.is_stateful())) {
|
||||
res = std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
}
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
|
|
@ -0,0 +1,61 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/gelu.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>
#include <openvino/op/slice.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_glu_geglu(const NodeContext & context) {
    num_inputs_check(context, 1, 2);

    ov::Output<ov::Node> src0;
    ov::Output<ov::Node> src1;
    if (context.get_input_size() == 2) {
        src0 = context.get_input(0);
        src1 = context.get_input(1);
    } else {
        // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
        // Both halves are nc elements; if the dimension is odd, the last element is dropped.
        // Use Slice instead of Split to handle odd dimensions correctly.
        auto combined = context.get_input(0);
        auto combined_shape = combined.get_partial_shape();
        int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
        int64_t nc = last_dim_val / 2;

        auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
        auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
        auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
        auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});

        src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
        src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
    }

    int32_t * params = context.get_output_op_params();
    const int32_t swapped = params[1];
    if (swapped) {
        std::swap(src0, src1);
    }

    auto gelu = std::make_shared<ov::op::v7::Gelu>(src0);
    auto res = std::make_shared<ov::op::v1::Multiply>(gelu, src1);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,62 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cstdint>
#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>
#include <openvino/op/slice.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_glu_swiglu(const NodeContext & context) {
    num_inputs_check(context, 1, 2);

    ov::Output<ov::Node> src0;
    ov::Output<ov::Node> src1;
    if (context.get_input_size() == 2) {
        src0 = context.get_input(0);
        src1 = context.get_input(1);
    } else {
        // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
        // Both halves are nc elements; if the dimension is odd, the last element is dropped.
        // Use Slice instead of Split to handle odd dimensions correctly.
        auto combined = context.get_input(0);
        auto combined_shape = combined.get_partial_shape();
        int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
        int64_t nc = last_dim_val / 2;

        auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
        auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
        auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
        auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});

        src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
        src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
    }

    int32_t * params = context.get_output_op_params();
    const int32_t swapped = params[1];
    if (swapped) {
        std::swap(src0, src1);
    }

    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
    auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
    auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

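For reference, a scalar sketch of what the SWIGLU translation above computes in the single-input case, where the last dimension is split in two halves of nc = ne0 / 2 elements each (an odd trailing element is dropped). The helper name is illustrative only; which half acts as the gate is decided by the "swapped" op param handled with std::swap in the translator:

#include <cmath>
#include <cstddef>
#include <vector>

// SwiGLU on one row: silu(x[0..nc)) * x[nc..2*nc).
std::vector<float> swiglu_row(const std::vector<float> & x) {
    const std::size_t nc = x.size() / 2;  // floor division, mirrors nc = ne[0] / 2
    std::vector<float> out(nc);
    for (std::size_t i = 0; i < nc; ++i) {
        const float a = x[i];                          // first half (gate)
        const float silu = a / (1.0f + std::exp(-a));  // x * sigmoid(x)
        out[i] = silu * x[nc + i];                     // multiply with second half
    }
    return out;
}
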
@ -0,0 +1,90 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <climits>
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/op/util/op_types.hpp>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_mulmat(const NodeContext & context) {
    num_inputs_check(context, 2, 2);

    int op_case = context.get_op_case();

    ov::Output<Node> res;
    ov::Output<ov::Node> B = context.get_input(0);
    ov::Output<ov::Node> A = context.get_input(1);

    bool transpose_b = true;
    if (op_case == 2) {
        B = B.get_node_shared_ptr()->input_value(0);
        transpose_b = false;
    } else if (op_case == 3) {
        B = process_view_input(context, 0);
        A = process_view_input(context, 1);
    }
    if (A.get_element_type() != B.get_element_type()) {
        B = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_input_type(1));
    }

    auto B_shape = context.get_input_shape(0).to_shape();
    auto A_shape = context.get_input_shape(1).to_shape();
    int64_t A_batch = A_shape[1];
    int64_t B_batch = B_shape[1];

    auto A_batch_larger = A_batch > B_batch;
    auto batch_large = A_batch_larger ? A_batch : B_batch;
    auto batch_small = A_batch_larger ? B_batch : A_batch;

    Output<Node> Z = A_batch_larger ? B : A;
    int64_t factor = batch_large / batch_small;
    if (factor > 1 && batch_small > 1) {
        auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
        auto batch_small_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_small});
        auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{factor});

        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
        auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);

        auto broadcast_shape = ov::op::v0::Constant::create(
            ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
        auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
                                                        {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});

        auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
                                                                     ov::op::BroadcastType::BIDIRECTIONAL);
        Z = std::make_shared<ov::op::v1::Reshape>(Z_broadcasted, new_Z_shape, true);
    }
    if (A_batch_larger) {
        B = Z;
    } else {
        A = Z;
    }

    res = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,102 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <climits>
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/transpose.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_permute(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4,
                                "Unsupported PERMUTE case");

    ov::Output<Node> res;
    auto src = context.get_input(0);
    auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});

    if (op_case == 1 || context.is_stateful()) {
        res = std::make_shared<ov::op::v1::Transpose>(src, perm);
    } else if (op_case == 4) {
        auto output_shape = context.get_output_shape().to_shape();
        auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]});
        auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
        auto n_seq_active = context.has_input("n_seq_active") ?
                                context.get_input("n_seq_active") :
                                ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[0]});
        auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});

        auto new_shape =
            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{n_seq_active, neg_one, n_heads, head_size}, 0);

        // // Alternative
        // auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        // auto new_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{n_seq_active, neg_one, zero, zero}, 0);

        auto reshaped = std::make_shared<ov::op::v1::Reshape>(src, new_shape, true);
        res = std::make_shared<ov::op::v1::Transpose>(reshaped, perm);
    } else {
        auto cache_shape = src.get_partial_shape();
        auto output_shape = context.get_output_shape().to_shape();
        int64_t head_size = output_shape[3];
        int64_t n_heads = output_shape[1];
        int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
        int64_t n_seq = cache_shape[1].get_length();

        Output<Node> attention_size;
        if (!context.has_input("attention_size")) {
            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]});
        } else if (op_case == 2) {
            attention_size = context.get_input("attention_size");
        } else {
            attention_size = context.get_input("attention_size_swa");
        }

        Output<Node> seq_active_start;
        Output<Node> seq_active_end;
        if (context.has_input("seq_active_start")) {
            seq_active_start = context.get_input("seq_active_start");
            seq_active_end = context.get_input("seq_active_end");
        } else {
            int64_t n_seq_active = output_shape[0];
            size_t offset = *((size_t *) context.get_input_op_params(0));
            int64_t seq_active_start_val = offset / context.get_input_stride(0)[0];
            int64_t seq_active_end_val = seq_active_start_val + n_seq_active;
            seq_active_start = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_start_val});
            seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val});
        }

        // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size]
        // 2. slice out the active sequences
        // 3. slice out the attention part in each sequence
        // 4. permute
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});

        auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
            src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false);
        auto slice1 = std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
        auto slice2 = std::make_shared<ov::op::v8::Slice>(slice1, zero, attention_size, one, one);
        res = std::make_shared<ov::op::v1::Transpose>(slice2, perm);
    }
    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,83 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <stdexcept>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_reshape(const NodeContext & context) {
    num_inputs_check(context, 1, 1);
    if (context.get_input_shape(0) == context.get_output_shape()) {
        return {context.get_input(0)};
    }

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(
        op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6,
        "Unsupported RESHAPE case");

    auto output_shape = context.get_output_shape().to_shape();
    std::shared_ptr<ov::Node> new_shape_node;
    if (op_case == 1) {
        if (context.is_stateful()) {
            new_shape_node = ov::op::v0::Constant::create(
                ov::element::i64, {3},
                std::vector<int64_t>{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
        } else {
            new_shape_node = ov::op::v0::Constant::create(
                ov::element::i64, {4},
                std::vector<int64_t>{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
        }
    } else if (op_case == 2) {
        new_shape_node = ov::op::v0::Constant::create(
            ov::element::i64, {4},
            std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]});

    } else if (op_case == 3) {
        throw std::runtime_error("might be outdated RESHAPE case");
        new_shape_node = ov::op::v0::Constant::create(
            ov::element::i64, {4}, std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1});

    } else if (op_case == 4) {
        return {context.get_input(0).get_node_shared_ptr()->input_value(0)};

    } else if (op_case == 5) {
        if (context.is_stateful()) {
            std::vector<int64_t> shape_vec = {1, -1, (int64_t) context.get_output_shape().to_shape()[3]};
            new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, shape_vec);
        } else {
            std::vector<int64_t> shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]};
            new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec);
        }

        // // Alternative
        // auto token_len = context.get_input("token_len");
        // auto emb_size =
        //     ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape().to_shape()[3]});
        // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        // new_shape_node = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, one, token_len, emb_size}, 0);

    } else if (op_case == 6) {
        new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape().to_shape());
    }
    auto res = std::make_shared<ov::op::v1::Reshape>(context.get_input(0), new_shape_node, false);
    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,46 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cstring>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/power.hpp>
#include <openvino/op/reduce_mean.hpp>
#include <openvino/op/sqrt.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_rms_norm(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto input_node = context.get_input(0);
    auto square = std::make_shared<ov::op::v1::Power>(
        input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f}));

    auto mean = std::make_shared<ov::op::v1::ReduceMean>(
        square, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true);

    float eps;
    memcpy(&eps, context.get_output_op_params(), sizeof(float));

    auto rms = std::make_shared<ov::op::v0::Sqrt>(
        std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps})));

    auto reciprocal =
        std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms);

    auto res = std::make_shared<ov::op::v1::Multiply>(input_node, reciprocal);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

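The RMS_NORM translation above builds y = x / sqrt(mean(x^2) + eps) along the last axis, with eps read from the op params. A scalar sketch of the same computation, for reference only (names are illustrative, not from the commit):

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> rms_norm_row(const std::vector<float> & x, float eps) {
    float mean_sq = 0.0f;
    for (float v : x) {
        mean_sq += v * v;
    }
    mean_sq /= static_cast<float>(x.size());
    const float inv_rms = 1.0f / std::sqrt(mean_sq + eps);  // reciprocal of the RMS
    std::vector<float> out(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = x[i] * inv_rms;
    }
    return out;
}
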
@ -0,0 +1,123 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_rope(const NodeContext & context) {
    num_inputs_check(context, 2, 3);

    int op_case = context.get_op_case();

    ov::Output<Node> res;

    auto data_node = context.get_input(0).get_node_shared_ptr();
    auto output_shape = context.get_output_shape().to_shape();
    int32_t * op_params = context.get_output_op_params();

    Output<Node> cos_theta_node;
    Output<Node> sin_theta_node;
    if (context.has_input("rope_cos")) {
        cos_theta_node = context.get_input("rope_cos");
        sin_theta_node = context.get_input("rope_sin");
    } else {
        auto inp_pos = context.get_input(1).get_node_shared_ptr();
        std::shared_ptr<ov::Node> rope_freqs_weight;
        if (context.get_input_size() == 3) {
            rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
        }
        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
        sin_theta_node = sin_cos.first;
        cos_theta_node = sin_cos.second;
    }

    if (op_case == 2) {
        // The input comes from a VIEW
        int slice_len = output_shape[2] * output_shape[3];
        data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr();
        if (context.is_stateful()) {
            auto data_shape = ov::op::v0::Constant::create(
                ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
            data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
        } else {
            auto data_shape = ov::op::v0::Constant::create(
                ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
            data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
        }
    }

    const int mode = op_params[2];
    constexpr int ROPE_TYPE_NORMAL = 0;
    constexpr int ROPE_TYPE_NEOX = 2;

    if (mode == ROPE_TYPE_NORMAL) {
        auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
        Output<Node> even_slice;
        Output<Node> odd_slice;
        int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4;
        even_slice = std::make_shared<ov::op::v8::Slice>(data_node, zero, end, two, neg_one);
        odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, one, end, two, neg_one);

        Output<Node> first_half =
            std::make_shared<ov::op::v1::Subtract>(std::make_shared<ov::op::v1::Multiply>(even_slice, cos_theta_node),
                                                   std::make_shared<ov::op::v1::Multiply>(odd_slice, sin_theta_node));
        Output<Node> second_half =
            std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(even_slice, sin_theta_node),
                                              std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));

        first_half = std::make_shared<ov::op::v0::Unsqueeze>(first_half,
                                                             ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
        second_half = std::make_shared<ov::op::v0::Unsqueeze>(second_half,
                                                              ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
        auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, unsqueeze_dim);

        auto data_shape = ov::op::v0::Constant::create(
            ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
        res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
    } else if (mode == ROPE_TYPE_NEOX) {
        auto data_split = std::make_shared<ov::op::v1::Split>(
            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
        Output<Node> slice_data_node_0 = data_split->outputs()[0];
        Output<Node> slice_data_node_1 = data_split->outputs()[1];

        auto first_half_node = std::make_shared<ov::op::v1::Subtract>(
            std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, cos_theta_node),
            std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, sin_theta_node));

        auto second_half_node = std::make_shared<ov::op::v1::Add>(
            std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, sin_theta_node),
            std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));

        res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,41 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cstring>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_scale(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    float scale;
    float bias;
    memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float));
    memcpy(&bias, (float *) context.get_output_op_params() + 1, sizeof(float));

    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
    auto scaled = std::make_shared<ov::op::v1::Multiply>(context.get_input(0), scale_node);

    std::shared_ptr<ov::Node> res;
    if (bias != 0.0f) {
        auto bias_node =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{bias});
        res = std::make_shared<ov::op::v1::Add>(scaled, bias_node);
    } else {
        res = scaled;
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,76 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <cassert>
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/scatter_update.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/transpose.hpp>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_set_rows(const NodeContext & context) {
    num_inputs_check(context, 3, 3);

    auto data = context.get_input(0);
    auto indices = context.get_input(1);
    auto dst = context.get_input(2);

    data = std::make_shared<ov::op::v0::Convert>(data, context.get_output_type());

    auto dst_shape = context.get_output_shape().to_shape();

    auto ind_squeezed =
        std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2}));
    auto data_reshaped = std::make_shared<ov::op::v1::Reshape>(
        data,
        ov::op::v0::Constant::create(ov::element::i64, {4},
                                     {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}),
        false);
    auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});

    Output<Node> res;
    if (context.is_stateful()) {
        int concat_axis = 1;
        int64_t dim2 = dst.get_partial_shape()[2].get_length();
        int64_t dim3 = dst.get_partial_shape()[3].get_length();
        data = std::make_shared<ov::op::v1::Reshape>(
            data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false);
        res = std::make_shared<ov::op::v0::Concat>(OutputVector{dst, data}, concat_axis);
    } else {
        res = std::make_shared<ov::op::v3::ScatterUpdate>(dst, ind_squeezed, data_reshaped, axes);
    }

    if (auto dst_reshape = std::dynamic_pointer_cast<ov::op::v1::Reshape>(dst.get_node_shared_ptr())) {
        // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb]
        // ctx_per_seq is not fixed due to llama-bench compatibility
        auto dst_shape_partial = dst_reshape->get_input_partial_shape(0);
        std::vector<int64_t> dst_shape = {dst_shape_partial[0].get_length(), dst_shape_partial[1].get_length(),
                                          dst_shape_partial[2].is_static() ? dst_shape_partial[2].get_length() : -1,
                                          dst_shape_partial[3].get_length()};
        res = std::make_shared<ov::op::v1::Reshape>(res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape),
                                                    false);
    }
    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,89 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <climits>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/softmax.hpp>
#include <stdexcept>
#include <vector>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_soft_max(const NodeContext & context) {
    // TODO code is outdated
    num_inputs_check(context, 1, 2);

    auto input_node = context.get_input(0).get_node_shared_ptr();
    ov::Output<Node> res;

    float scale = 1.0f;
    float max_bias = 0.0f;
    auto * op_params = context.get_output_op_params();
    memcpy(&scale, (float *) op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) op_params + 1, sizeof(float));
    auto src0_shape = context.get_input_shape(0).get_shape();
    const uint32_t h = src0_shape[2];
    const uint32_t n_head = src0_shape[0];
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float slope =
        (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;

    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
    auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);

    if (context.get_input_size() < 2) {
        res = std::make_shared<ov::op::v8::Softmax>(scaled_input, 2);
        return rename_outputs_with_suffix({res}, context.get_name());
    }

    ov::Output<ov::Node> mask_node_sliced;
    if (context.has_input("KQ_mask_sliced")) {
        mask_node_sliced = context.get_input("KQ_mask_sliced");
    } else {
        auto token_len = get_dimensions(input_node, {1});
        auto mask_node = context.get_input(1);
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    }

    if (mask_node_sliced.get_element_type() != context.get_output_type()) {
        mask_node_sliced = std::make_shared<ov::op::v0::Convert>(mask_node_sliced, context.get_output_type());
    }

    Output<Node> slope_mask;
    if (slope != 1.0f) {
        auto slope_node =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{slope});
        slope_mask = std::make_shared<ov::op::v1::Multiply>(mask_node_sliced, slope_node);
        throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use.");
    }
    slope_mask = mask_node_sliced;

    auto input_slope_mask_node = std::make_shared<ov::op::v1::Add>(scaled_input, slope_mask);

    res = std::make_shared<ov::op::v8::Softmax>(input_slope_mask_node, 2);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

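The slope computed above follows the ALiBi scheme used by ggml: with n_head_log2 the largest power of two not exceeding n_head, the first n_head_log2 heads use m0^(h+1) and the rest use m1^(2*(h-n_head_log2)+1), and the slope is 1.0 when max_bias is zero. A small standalone sketch of that formula, for reference only:

#include <cmath>
#include <cstdint>

float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 1.0f;  // ALiBi disabled
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? std::pow(m0, (float) (h + 1))
                           : std::pow(m1, (float) (2 * (h - n_head_log2) + 1));
}
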
@ -0,0 +1,23 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <openvino/op/transpose.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_transpose(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto res = std::make_shared<ov::op::v1::Transpose>(
        context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2}));
    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,27 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"

#include <openvino/core/node_output.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_unary_silu(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto input = context.get_input(0);
    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(input);
    auto res = std::make_shared<ov::op::v1::Multiply>(input, sigmoid);

    return rename_outputs_with_suffix({res}, context.get_name());
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,23 @@
#include "../op_table.hpp"
#include "../utils.hpp"

namespace ov {
namespace frontend {
namespace ggml {
namespace op {

OutputVector translate_view(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    if (context.get_op_case() == 2) {
        auto dst_shape = context.get_output_shape().to_shape();
        return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])},
                                          context.get_name());
    }
    return {context.get_input(0)};
}

} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,46 @@
#include "op_table.hpp"

#include "utils.hpp"

#include <openvino/op/add.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>

namespace ov {
namespace frontend {
namespace ggml {

std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
    using namespace ov::op;
    return {
        {"GGML_OP_ADD",            op::translate_1to1_match_2_inputs<v1::Add>     },
        {"GGML_OP_ADD1",           op::translate_1to1_match_2_inputs<v1::Add>     },
        {"GGML_OP_CONT",           op::translate_cont                             },
        {"GGML_OP_DIV",            op::translate_1to1_match_2_inputs<v1::Divide>  },
        {"GGML_OP_GET_ROWS",       op::translate_get_rows                         },
        {"GGML_OP_MUL",            op::translate_1to1_match_2_inputs<v1::Multiply>},
        {"GGML_OP_MUL_MAT",        op::translate_mulmat                           },
        {"GGML_OP_PERMUTE",        op::translate_permute                          },
        {"GGML_OP_RESHAPE",        op::translate_reshape                          },
        {"GGML_OP_RMS_NORM",       op::translate_rms_norm                         },
        {"GGML_OP_ROPE",           op::translate_rope                             },
        {"GGML_OP_SCALE",          op::translate_scale                            },
        {"GGML_OP_SOFT_MAX",       op::translate_soft_max                         },
        {"GGML_OP_SUB",            op::translate_1to1_match_2_inputs<v1::Subtract>},
        {"GGML_OP_TRANSPOSE",      op::translate_transpose                        },
        {"GGML_UNARY_OP_SILU",     op::translate_unary_silu                       },
        {"GGML_OP_VIEW",           op::translate_view                             },
        {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu                       },
        {"GGML_GLU_OP_GEGLU",      op::translate_glu_geglu                        },
        {"GGML_OP_SET_ROWS",       op::translate_set_rows                         },
        {"GGML_OP_CPY",            op::translate_cpy                              },
        {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext                   },
    };
}

} // namespace ggml
} // namespace frontend
} // namespace ov

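Each entry in this table maps a GGML op name to a CreatorFunction; the translate session looks the op name up and invokes the matching translator with a NodeContext for that node. A hedged sketch of the lookup pattern, assuming CreatorFunction is callable with a NodeContext reference (the function and variable names here are illustrative and not part of the commit):

#include <string>

#include "op_table.hpp"

// Illustrative only: dispatching one decoded GGML node through the table above.
ov::OutputVector dispatch_ggml_node(const std::string & op_name,
                                    const ov::frontend::ggml::NodeContext & ctx) {
    static const auto supported_ops = ov::frontend::ggml::get_supported_ops();
    const auto it = supported_ops.find(op_name);  // e.g. "GGML_OP_RMS_NORM"
    OPENVINO_ASSERT(it != supported_ops.end(), "Unsupported GGML op: ", op_name);
    return it->second(ctx);                       // run the registered translator
}
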
@ -0,0 +1,39 @@
#pragma once

#include "node_context.hpp"

namespace ov {
namespace frontend {
namespace ggml {

namespace op {

#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context)

GGML_OP_CONVERTER(translate_add);
GGML_OP_CONVERTER(translate_cont);
GGML_OP_CONVERTER(translate_get_rows);
GGML_OP_CONVERTER(translate_mul);
GGML_OP_CONVERTER(translate_mulmat);
GGML_OP_CONVERTER(translate_permute);
GGML_OP_CONVERTER(translate_reshape);
GGML_OP_CONVERTER(translate_rms_norm);
GGML_OP_CONVERTER(translate_rope);
GGML_OP_CONVERTER(translate_scale);
GGML_OP_CONVERTER(translate_unary_silu);
GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view);
GGML_OP_CONVERTER(translate_glu_swiglu);
GGML_OP_CONVERTER(translate_glu_geglu);
GGML_OP_CONVERTER(translate_set_rows);
GGML_OP_CONVERTER(translate_cpy);
GGML_OP_CONVERTER(translate_flash_attn_ext);

} // namespace op

std::unordered_map<std::string, CreatorFunction> get_supported_ops();

} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,123 @@
#include "eliminate_zp.hpp"

#include <openvino/core/graph_util.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

EliminateZeroPoints::EliminateZeroPoints() {
    // Find pattern:
    // (Multiply Any(scale)
    //     (Subtract (Convert Constant(data)))
    //     (Convert Constant(zero_point)))
    // where zero_point is a scalar
    // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val
    // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant

    auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
    auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});

    auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
    auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});

    auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
    auto m_scale = ov::pass::pattern::any_input();
    auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});

    const auto callback = [=](ov::pass::pattern::Matcher & m) {
        const auto & pattern_map = m.get_pattern_value_map();

        auto multiply_node =
            std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
        auto subtract_node =
            std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
        auto data_constant =
            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
        auto zp_constant =
            std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());

        if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
            return false;
        }

        if (ov::shape_size(zp_constant->get_shape()) != 1) {
            return false;
        }

        auto data_type = data_constant->get_element_type();
        auto zp_data = zp_constant->cast_vector<int>();

        if (zp_data.empty()) {
            return false;
        }

        int zp_value = zp_data[0];

        bool should_eliminate = false;
        ov::element::Type target_type;

        if (data_type == ov::element::u4 && zp_value == 8) {
            should_eliminate = true;
            target_type = ov::element::i4;
        } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
            should_eliminate = true;
            target_type = ov::element::i8;
        }

        if (!should_eliminate) {
            return false;
        }

        auto data_shape = data_constant->get_shape();
        size_t total_elements = ov::shape_size(data_shape);

        std::shared_ptr<ov::op::v0::Constant> new_constant;

        // TODO improve performance
        if (data_type == ov::element::u4) {
            auto data_values = data_constant->cast_vector<uint8_t>();
            std::vector<int8_t> adjusted_values(total_elements);

            ov::parallel_for(total_elements, [&](size_t i) {
                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
            });

            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
        } else if (data_type == ov::element::u8) {
            auto data_values = data_constant->cast_vector<uint8_t>();
            std::vector<int8_t> adjusted_values(total_elements);

            ov::parallel_for(total_elements, [&, zp_value](size_t i) {
                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
            });

            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
        }

        auto new_convert =
            std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
        ov::replace_node(subtract_node, new_convert);

        return true;
    };

    register_matcher(
        std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
        callback);
}

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

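The pass above folds a scalar zero point directly into the weight constant: (u - zp) * scale, with u stored as u4 and zp == 8 (q4_0), becomes s * scale where s = u - 8 fits the signed i4 range [-8, 7]; the u8 cases with zp 128 or 32 map to i8 the same way. A scalar sketch of that adjustment, for reference only (the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// q4_0-style adjustment: unsigned 4-bit values in [0, 15] with zero point 8
// become signed values in [-8, 7]; the multiply by scale is left untouched.
std::vector<int8_t> fold_zero_point_u4(const std::vector<uint8_t> & u4_values) {
    std::vector<int8_t> out(u4_values.size());
    for (std::size_t i = 0; i < u4_values.size(); ++i) {
        out[i] = static_cast<int8_t>(static_cast<int>(u4_values[i]) - 8);
    }
    return out;
}
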
@ -0,0 +1,17 @@
#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

class EliminateZeroPoints : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
    EliminateZeroPoints();
};

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,60 @@
#include "fuse_to_sdpa.hpp"

#include <openvino/core/graph_util.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/scaled_dot_product_attention.hpp>
#include <openvino/op/softmax.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

FuseToSDPA::FuseToSDPA() {
    // Not maintained since FLASH_ATTN_EXT has replaced this pattern
    const auto m_k = ov::pass::pattern::any_input();
    const auto m_q = ov::pass::pattern::any_input();
    const auto m_qk = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_q, m_k});
    const auto m_qk_f32 = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_qk});
    const auto m_scale = ov::pass::pattern::any_input();
    const auto m_scaled_qk = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_qk_f32, m_scale});
    const auto m_mask = ov::pass::pattern::any_input();
    const auto m_masked_qk = ov::pass::pattern::wrap_type<ov::op::v1::Add>({m_scaled_qk, m_mask});
    const auto m_softmax_qk = ov::pass::pattern::wrap_type<ov::op::v8::Softmax>({m_masked_qk});
    const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_softmax_qk});
    const auto m_v = ov::pass::pattern::any_input();
    const auto m_qkv = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_softmax_qk_f16, m_v});

    const auto callback = [=](ov::pass::pattern::Matcher & m) {
        auto & pattern_to_output = m.get_pattern_value_map();
        auto k = pattern_to_output[m_k];
        auto q = pattern_to_output[m_q];
        auto v = pattern_to_output[m_v];
        auto mask = pattern_to_output[m_mask];
        auto scale = pattern_to_output[m_scale];

        auto mask_f16 = register_new_node<ov::op::v0::Convert>(mask, ov::element::f16);
        auto scale_f16 = register_new_node<ov::op::v0::Convert>(scale, ov::element::f16);
        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_f16, scale_f16, false);

        ov::replace_node(m.get_match_root(), sdpa);
        ov::copy_runtime_info(m.get_matched_nodes(), sdpa);

        return true;
    };
    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_qkv, "ov::frontend::ggml::pass::FuseToSDPA"),
                     callback);
}

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,17 @@
#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

class FuseToSDPA : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::FuseToSDPA")
    FuseToSDPA();
};

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,29 @@
#pragma once

#include "mark_decompression_convert_constant_folding.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/core/visibility.hpp"

#ifdef OPENVINO_STATIC_LIBRARY
#    define TRANSFORMATIONS_API
#else
#    ifdef IMPLEMENT_OPENVINO_API
#        define TRANSFORMATIONS_API OPENVINO_CORE_EXPORTS
#    else
#        define TRANSFORMATIONS_API OPENVINO_CORE_IMPORTS
#    endif  // IMPLEMENT_OPENVINO_API
#endif  // OPENVINO_STATIC_LIBRARY

namespace ov {
namespace pass {

class TRANSFORMATIONS_API MarkCompressedFloatConstants;

} // namespace pass
} // namespace ov

class ov::pass::MarkCompressedFloatConstants : public MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants")
    MarkCompressedFloatConstants();
};

@ -0,0 +1,58 @@
#include "squeeze_matmul.hpp"

#include <openvino/core/graph_util.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>

namespace opp = ov::pass::pattern;

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

// For quantized models, NPUW expects the activation to be 3d in DQ(DynamicQuantization) opt, e.g. DQMatMulGQ2i
SqueezeMatmul::SqueezeMatmul() {
    auto m_act = opp::any_input();
    auto m_wei = opp::any_input();
    auto m_matmul = opp::wrap_type<ov::op::v0::MatMul>({m_act, m_wei});

    const auto callback = [=](ov::pass::pattern::Matcher & m) {
        const auto & pattern_map = m.get_pattern_value_map();
        auto matmul_node =
            std::dynamic_pointer_cast<ov::op::v0::MatMul>(pattern_map.at(m_matmul).get_node_shared_ptr());
        auto act = pattern_map.at(m_act);
        auto wei = pattern_map.at(m_wei);
        auto act_shape = act.get_partial_shape();
        auto wei_shape = wei.get_partial_shape();
        if (act_shape.rank().is_dynamic() || wei_shape.rank().is_dynamic()) {
            return false;
        }
        if (act_shape.rank().get_length() == 4 && wei_shape.rank().get_length() == 2) {
            auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0});
            auto squeezed_act = std::make_shared<ov::op::v0::Squeeze>(act, axis);
            auto new_matmul = std::make_shared<ov::op::v0::MatMul>(squeezed_act, wei, matmul_node->get_transpose_a(),
                                                                   matmul_node->get_transpose_b());
            auto unsqueezed_output = std::make_shared<ov::op::v0::Unsqueeze>(new_matmul, axis);
            unsqueezed_output->set_friendly_name(matmul_node->get_friendly_name());
            ov::copy_runtime_info(matmul_node, {squeezed_act, new_matmul, unsqueezed_output});
            ov::replace_node(matmul_node, unsqueezed_output);
            return true;
        }
        return false;
    };

    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_matmul, "ov::frontend::ggml::pass::SqueezeMatmul"),
                     callback);
}

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,17 @@
#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace frontend {
namespace ggml {
namespace pass {

class SqueezeMatmul : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::SqueezeMatmul")
    SqueezeMatmul();
};

} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,292 @@
|
|||
#include "translate_session.hpp"
|
||||
|
||||
#include "ggml-openvino/openvino/node_context.hpp"
|
||||
#include "ggml-openvino/openvino/utils.hpp"
|
||||
#include "input_model.hpp"
|
||||
#include "pass/eliminate_zp.hpp"
|
||||
#include "pass/mark_decompression_convert_constant_folding.hpp"
|
||||
#include "pass/squeeze_matmul.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/convert_like.hpp>
|
||||
#include <openvino/op/cos.hpp>
|
||||
#include <openvino/op/divide.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/parameter.hpp>
|
||||
#include <openvino/op/range.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/result.hpp>
|
||||
#include <openvino/op/sin.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/strided_slice.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <openvino/pass/constant_folding.hpp>
|
||||
#include <openvino/pass/make_stateful.hpp>
|
||||
#include <openvino/core/preprocess/pre_post_process.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
using namespace ov::op;
|
||||
|
||||
namespace {
|
||||
|
||||
ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
|
||||
const std::shared_ptr<ov::Model> & model,
|
||||
const std::map<std::string, std::string> & kv_param_res_names) {
|
||||
ov::pass::MakeStateful::ParamResPairs pairs;
|
||||
const auto & params = model->get_parameters();
|
||||
const auto & results = model->get_results();
|
||||
|
||||
for (const auto & param_res : kv_param_res_names) {
|
||||
const auto & param_name = param_res.first;
|
||||
const auto & res_name = param_res.second;
|
||||
|
||||
auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<v0::Parameter> & node) {
|
||||
return node->get_friendly_name() == param_name;
|
||||
});
|
||||
|
||||
OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name,
|
||||
" is not associated with any of "
|
||||
"Parameters in the network.");
|
||||
|
||||
auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<v0::Result> & node) {
|
||||
return node->get_friendly_name() == res_name;
|
||||
});
|
||||
|
||||
OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name,
|
||||
" is not associated with any of "
|
||||
"Results in the network.");
|
||||
|
||||
std::shared_ptr<ov::op::v0::Parameter> param = *param_it;
|
||||
std::shared_ptr<ov::op::v0::Result> res = *res_it;
|
||||
pairs.emplace_back(param, res);
|
||||
}
|
||||
return pairs;
|
||||
}
|
||||
|
||||
void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
    auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();

    auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
        if (tensor_map.find(mask_name) != tensor_map.end()) {
            auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
            std::shared_ptr<ov::Node> mask_sliced;
            if (is_static) {
                mask_sliced = mask;
            } else if (ggml_model_decoder.is_stateful()) {
                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
                auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
                auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
                auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
                auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2, -1});
                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
                auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
                auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(
                    gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
                auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(
                    reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
                auto stop = std::make_shared<ov::op::v0::Concat>(
                    ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
                mask_sliced->set_friendly_name(sliced_name);
            } else {
                auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
                auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
                auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len_per_seq, one, two);
                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
                mask_sliced->set_friendly_name(sliced_name);
            }
            tensor_map.insert({sliced_name, mask_sliced->output(0)});
        }
    };

    create_sliced_mask("self_kq_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
    create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
}

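// Precomputes the shared RoPE sin/cos tables once per graph from the decoder's rope
// parameters and the token positions, inserting them into the tensor map as "rope_sin"
// and "rope_cos" so later translated nodes can reuse them instead of rebuilding them.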
void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
    int32_t * rope_params = ggml_model_decoder.get_rope_params();
    if (tensor_map.find("inp_pos") == tensor_map.end() || rope_params == nullptr) {
        return;
    }
    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
    std::shared_ptr<ov::Node> rope_freqs_weight;
    if (tensor_map.find("rope_freqs.weight") != tensor_map.end()) {
        rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
    }

    auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
    auto sin_theta = sin_cos.first;
    auto cos_theta = sin_cos.second;

    cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
    sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
    tensor_map.insert({"rope_cos", cos_theta});
    tensor_map.insert({"rope_sin", sin_theta});
}

// Create common patterns
void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
    add_sliced_mask(tensor_map, ggml_model_decoder);
    add_rope_sin_cos(tensor_map, ggml_model_decoder);
}

} // namespace

TranslateSession::TranslateSession(const frontend::InputModel::Ptr & input_model,
                                   const std::unordered_map<std::string, CreatorFunction> & translator_map,
                                   bool naive) :
    m_input_model(input_model),
    m_translator_map(translator_map),
    m_ov_model(nullptr),
    m_naive(naive) {}

std::shared_ptr<Model> TranslateSession::get_converted_model() {
    if (m_ov_model) {
        return m_ov_model;
    }
    m_ov_model = translate_graph(m_input_model);
    return m_ov_model;
}

std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputModel::Ptr & input_model) {
    ov::ParameterVector params;
    ov::ResultVector results;
    auto tensor_map = std::make_shared<TensorMap>();
    std::shared_ptr<Model> resulting_model;

    const auto & ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
    std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();

    for (const auto & it : ggml_model_decoder->get_model_inputs()) {
        params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
        (*tensor_map)[it.first] = it.second;
    }

    for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
        if (std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {
            params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
        }
        (*tensor_map)[it.first] = it.second;
    }

    for (const auto & it : ggml_model_decoder->get_model_weights()) {
        (*tensor_map)[it.first] = it.second;
    }

    auto node_visitor = [&](std::shared_ptr<GgmlDecoder> decoder, int node_idx) {
        auto operation_type = decoder->get_op_type(node_idx);
        if (operation_type == "GGML_OP_NONE") {
            return;
        }

        ov::OutputVector converted_outputs;
        auto it = m_translator_map.find(operation_type);
        FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type,
                                      " is not implemented.");
        NodeContext node_context(decoder, tensor_map, node_idx, this);
        converted_outputs = it->second(node_context);

        const auto & node_output_names = decoder->get_output_names(node_idx);
        FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ",
                                      operation_type, " outputs does not match number of converted outputs, which are ",
                                      node_output_names.size(), " and ", converted_outputs.size(), " respectively.");

        for (size_t i = 0; i < node_output_names.size(); ++i) {
            auto output_name = node_output_names[i];
            if (i < converted_outputs.size() && converted_outputs[i].get_node_shared_ptr() != nullptr) {
                (*tensor_map)[output_name] = converted_outputs[i];
            }
        }
    };

    if (!m_naive) {
        preprocess(*tensor_map, *ggml_model_decoder);
    }
    ggml_model_decoder->visit_subgraph(node_visitor);

    for (const auto & name : ggml_model_decoder->get_model_output_names()) {
        FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(),
                                "Output name not found in tensor map: ", name);
        auto result = std::make_shared<v0::Result>(tensor_map->at(name));
        result->set_friendly_name(name);
        results.push_back(result);
    }

    ov::ParameterVector used_params;
    for (const auto & param : params) {
        if (!param->output(0).get_target_inputs().empty()) {
            used_params.push_back(param);
        }
    }
    // if (auto diff = params.size() - used_params.size()) {
    //     GGML_LOG_INFO("%zu parameters are not used in the model.", diff);
    // }
    resulting_model = std::make_shared<Model>(results, used_params);

    apply_transformations(resulting_model);
    return resulting_model;
}

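// Post-conversion pipeline: compressed float constants are marked, KV-cache Parameter/Result
// pairs are folded into model state via MakeStateful on the stateful path, NPU-only passes
// (EliminateZeroPoints, SqueezeMatmul) run for static graphs, and a PrePostProcessor restores
// a leading unit dimension on stateful outputs that come out with one fewer dimension than
// the decoder expects.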
std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<Model> model) {
    auto ggml_model_decoder = std::dynamic_pointer_cast<InputModel>(m_input_model)->get_model_decoder();
    {
        ov::pass::Manager manager;
        manager.set_per_pass_validation(true);
        manager.register_pass<ov::pass::MarkCompressedFloatConstants>();

        if (ggml_model_decoder->is_stateful()) {
            const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
            const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
            manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
        }

        if (ggml_model_decoder->is_static()) {
            manager.register_pass<pass::EliminateZeroPoints>();
            manager.register_pass<pass::SqueezeMatmul>();
        }
        manager.run_passes(model);
        if (ggml_model_decoder->is_stateful()) {
            auto output_names = ggml_model_decoder->get_model_output_names();
            std::map<std::string, int> model_output_indexes;
            for (size_t i = 0; i < output_names.size(); i++) {
                model_output_indexes.insert(std::make_pair(output_names[i], i));
            }
            ov::preprocess::PrePostProcessor ppp(model);
            for (size_t i = 0; i < model->get_output_size(); i++) {
                auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name();
                auto output_id = model_output_indexes[output_friendly_name];
                auto model_output_shape = model->output(i).get_partial_shape();
                auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id);
                if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static()
                    && model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length()
                    && decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) {
                    ppp.output(i).postprocess().custom([](const ov::Output<ov::Node> & node) {
                        auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0});
                        return std::make_shared<ov::op::v0::Unsqueeze>(node, axes);
                    });
                }
            }
            model = ppp.build();
        }
    }
    return model;
}

} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,28 @@
#pragma once

#include "input_model.hpp"
#include "node_context.hpp"

namespace ov {
namespace frontend {
namespace ggml {

class TranslateSession {
public:
    TranslateSession(const frontend::InputModel::Ptr& input_model,
                     const std::unordered_map<std::string, CreatorFunction>& translator_map, bool naive = false);

    std::shared_ptr<Model> get_converted_model();
    std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);

private:
    std::shared_ptr<Model> apply_transformations(std::shared_ptr<Model> model);
    const frontend::InputModel::Ptr m_input_model;
    const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
    std::shared_ptr<Model> m_ov_model;
    bool m_naive;
};

} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,226 @@
#include "utils.hpp"

#include "ggml-impl.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <ctime>
#include <memory>
#include <numeric>
#include <openvino/op/add.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <string>

namespace ov {
namespace frontend {
namespace ggml {

std::string getCurrentTime() {
    std::time_t now = std::time(nullptr);
    char buf[100];
    std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now));
    return buf;
}

void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs) {
    auto input_size = context.get_input_size();
    FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got fewer inputs than expected");
    FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected");
}

int non_cont_dim(std::vector<size_t> ne, std::vector<size_t> nb) {
    int dim = nb.size() - 1;
    size_t bytes = nb[dim];
    for (int i = dim; i > 0; i--) {
        bytes *= ne[i];
        if (bytes != nb[i - 1]) {
            return i;
        }
    }
    return 0;
}

std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf> & shape,
                                         const std::vector<int> & dims) {
    using namespace ov::op;
    const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
    const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims);
    return std::make_shared<v8::Gather>(shape, dims_const, zero);
}

std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node> & node, const std::vector<int> & dims) {
    return get_dimensions(std::make_shared<ov::op::v3::ShapeOf>(node), dims);
}

OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix) {
    for (const auto & output : outputs) {
        auto node = output.get_node_shared_ptr();
        std::string name = node->get_friendly_name();
        name += "_";
        name += suffix;
        node->set_friendly_name(name);
        // std::cout << name << " " << output.get_partial_shape() << std::endl;
    }
    return outputs;
}

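// Helpers for YaRN-style RoPE scaling: rope_yarn_ramp_mix builds the per-dimension ramp that
// blends interpolated and extrapolated rotation angles, and ggml_rope_yarn_corr_dims computes
// the dimension range over which that blend is applied, following the ggml helpers of the
// same name.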
namespace {
ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) {
    int half_n_dims = n_dims / 2;
    std::vector<float> dim_ids_vec(half_n_dims);
    std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0);
    auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, (size_t) half_n_dims}, dim_ids_vec);
    auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {corr_dims[0]});
    auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {corr_dims[1]});
    auto denom = std::make_shared<ov::op::v1::Maximum>(
        std::make_shared<ov::op::v1::Subtract>(corr_high, corr_low),
        ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {0.001f}));
    auto ramp_y =
        std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
    auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
    auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
    return ramp_mix;
}

float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
#ifndef M_PI
#    define M_PI 3.14159265358979323846
#endif
    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}

void ggml_rope_yarn_corr_dims(int n_dims,
                              int n_ctx_orig,
                              float freq_base,
                              float beta_fast,
                              float beta_slow,
                              float dims[2]) {
    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    dims[0] = std::max(0.0f, start);
    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
}
} // namespace

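// Builds the sin/cos rotation tables for RoPE from the raw ggml rope parameters (frequency
// base/scale, YaRN ext_factor, attn_factor, beta_fast/slow). Per-dimension frequencies are
// optionally divided by rope_freqs_weight, positions are scaled into interpolated and
// extrapolated angles, and with a non-zero ext_factor the two are blended by the YaRN ramp;
// the magnitude correction mscale is folded into the returned sin/cos outputs.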
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight,
|
||||
bool stateful) {
|
||||
if (stateful) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_perm =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
|
||||
} else {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
auto pos_perm =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
|
||||
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
const int n_dims = rope_params[1];
|
||||
const int n_ctx_orig = rope_params[4];
|
||||
memcpy(&freq_base, rope_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, rope_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, rope_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, rope_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, rope_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, rope_params + 10, sizeof(float));
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
std::vector<float> factor(n_dims / 2);
|
||||
factor[0] = 1.0f;
|
||||
for (size_t i = 1; i < factor.size(); i++) {
|
||||
factor[i] = theta_scale * factor[i - 1];
|
||||
}
|
||||
|
||||
Output<Node> freq_factors;
|
||||
if (stateful) {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
|
||||
} else {
|
||||
freq_factors =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor);
|
||||
}
|
||||
if (rope_freqs_weight) {
|
||||
freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
|
||||
}
|
||||
|
||||
auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
|
||||
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
|
||||
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
|
||||
|
||||
Output<Node> theta;
|
||||
float mscale = attn_factor;
|
||||
if (ext_factor == 0.0f) {
|
||||
theta = theta_interp;
|
||||
} else {
|
||||
auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
|
||||
Output<Node> one;
|
||||
if (stateful) {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
|
||||
} else {
|
||||
one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
|
||||
}
|
||||
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
|
||||
|
||||
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
|
||||
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
|
||||
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
|
||||
}
|
||||
|
||||
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
|
||||
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
|
||||
|
||||
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
|
||||
|
||||
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
|
||||
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
|
||||
return std::make_pair(sin_theta, cos_theta);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_index, int slice_len) {
|
||||
// Only works for VIEW operations that slice at the lowest dimension
|
||||
// If the VIEW also reshape the result, `slice_len` should be provided
|
||||
auto input = context.get_input(input_index);
|
||||
auto * op_params = (size_t *) context.get_input_op_params(input_index);
|
||||
auto src1_stride = context.get_input_stride(input_index);
|
||||
|
||||
int64_t split_addr = op_params[0] / src1_stride[3];
|
||||
if (slice_len == 0) {
|
||||
slice_len = context.get_input_shape(input_index)[3].get_length();
|
||||
}
|
||||
int64_t slice_end = split_addr + slice_len;
|
||||
|
||||
auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr});
|
||||
auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end});
|
||||
auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {context.is_stateful() ? 2 : 3});
|
||||
auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
|
||||
return sliced;
|
||||
}
|
||||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
|
|
@ -0,0 +1,85 @@
#pragma once

#include <algorithm>
#include <memory>
#include <numeric>
#include <openvino/core/node.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <string>
#include <utility>
#include <vector>

#include "node_context.hpp"

namespace ov {
namespace frontend {
namespace ggml {

std::string getCurrentTime();

void dump_ov_model(std::shared_ptr<ov::Model> model);

void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);

int non_cont_dim(std::vector<size_t> ne, std::vector<size_t> nb);

template <typename T>
std::vector<int> argsort_descend(const std::vector<T>& v) {
    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) {
        return v[i1] > v[i2];
    });
    return idx;
}

template <typename T>
std::vector<T> sorted_descend(std::vector<T> v) {
    std::sort(v.begin(), v.end(), [](T a, T b) {
        return a > b;
    });
    return v;
}

template <typename T>
bool is_permuted(const std::vector<T>& strides) {
    for (size_t i = 0; i < strides.size() - 1; ++i) {
        if (strides[i] < strides[i + 1]) {
            return true;
        }
    }
    return false;
}

template <typename T>
std::vector<T> permute(const std::vector<T>& x, const std::vector<int>& perm) {
    std::vector<T> result;
    result.reserve(perm.size());
    for (int i : perm) {
        result.push_back(x[i]);
    }
    return result;
}

std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf>& shape,
                                         const std::vector<int>& dims);
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node, const std::vector<int>& dims);

OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);

std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
                                                           bool stateful = false);

ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);

namespace op {
template <typename T>
OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
    num_inputs_check(context, 2, 2);
    auto res = std::make_shared<T>(context.get_input(0), context.get_input(1));
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op

} // namespace ggml
} // namespace frontend
} // namespace ov

@ -0,0 +1,802 @@
#include "utils.h"

#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <mutex>
#include <openvino/core/any.hpp>
#include <openvino/core/graph_util.hpp>
#include <openvino/core/shape.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/frontend/manager.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/compiled_model.hpp>
#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>
#include <openvino/runtime/properties.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include <unordered_map>
#include <vector>

// Suppress deprecation warning for ov::Tensor::data()
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"

enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
    try {
        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
            std::string filename = "cgraph_ov.txt";
            GgmlOvDecoder::dump_cgraph(cgraph, filename);
        }

        // Use device from singleton (initialized during backend init)
        const auto & device = ggml_openvino_get_device_name();
        const auto is_static = ggml_openvino_is_npu();
        bool stateful = false;
        if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) {
            stateful = true;
        }

        return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful);
    } catch (const ov::Exception & e) {
        GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what());
        return GGML_STATUS_FAILED;
    } catch (const std::exception & e) {
        GGML_LOG_ERROR("GGML OpenVINO backend std::exception: %s\n", e.what());
        return GGML_STATUS_FAILED;
    } catch (...) {
        GGML_LOG_ERROR("GGML OpenVINO backend unknown exception\n");
        return GGML_STATUS_FAILED;
    }
}

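// Dynamic-shape path (CPU/GPU): decoders, infer requests and OpenVINO input/output names are
// cached per graph_key, so a graph matching a previously converted one only refreshes its
// parameters and rebinds tensors instead of being reconverted and recompiled. With stateful
// execution the KV cache lives in the model state: it is reset when positions restart at 0
// and truncated when the requested position rewinds below what has been accumulated.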
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device, bool stateful) {
|
||||
auto & core = ov_singleton_core();
|
||||
const auto & config = ggml_openvino_get_compile_config();
|
||||
static auto is_static = false;
|
||||
static size_t stateful_kv_size = 0;
|
||||
|
||||
if (is_naive(cgraph)) {
|
||||
return naive_compute(cgraph, core, device, config);
|
||||
}
|
||||
|
||||
auto start_time = ggml_time_us();
|
||||
|
||||
static std::mutex cache_mutex;
|
||||
static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
|
||||
static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
|
||||
static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
|
||||
static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache;
|
||||
|
||||
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
|
||||
std::shared_ptr<ov::InferRequest> infer_request;
|
||||
ModelParams m_params;
|
||||
ComputeParams c_params;
|
||||
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
|
||||
|
||||
graph_key key(cgraph);
|
||||
bool cache_hit;
|
||||
|
||||
int64_t decoder_end_time;
|
||||
int64_t conversion_end_time;
|
||||
int64_t compile_end_time;
|
||||
int64_t infer_end_time;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(cache_mutex);
|
||||
|
||||
auto it = decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_dynamically(m_params);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
ggml_decoder->set_compute_params(c_params);
|
||||
ggml_decoder->set_model_params(m_params);
|
||||
if (old_m_params.kv_buffer_changed(m_params)) {
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = infer_request_cache.at(key);
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
int32_t * pos_data = (int32_t *) inp_pos->data;
|
||||
auto pos_shape = ggml_decoder->get_shape(inp_pos);
|
||||
if (pos_data[0] == 0) {
|
||||
infer_request->reset_state();
|
||||
stateful_kv_size = pos_shape[3];
|
||||
} else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) {
|
||||
stateful_kv_size += pos_shape[3];
|
||||
} else {
|
||||
auto states = infer_request->query_state();
|
||||
for (auto state : states) {
|
||||
auto state_tensor = state.get_state();
|
||||
ov::Coordinate begin = {0, 0, 0, 0};
|
||||
ov::Coordinate end = {state_tensor.get_shape()[0], static_cast<uint32_t>(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]};
|
||||
ov::Tensor new_state_tensor(state_tensor, begin, end);
|
||||
state.set_state(new_state_tensor);
|
||||
}
|
||||
stateful_kv_size = pos_data[0] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
infer_request_cache.erase(key);
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
|
||||
ggml_decoder =
|
||||
std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
||||
ggml_decoder->clear_model_weights();
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long) ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
|
||||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
ov::CompiledModel compiled_model;
|
||||
auto remote_context = ggml_openvino_get_remote_context();
|
||||
if (remote_context.has_value()) {
|
||||
compiled_model = core.compile_model(model, remote_context.value(), config);
|
||||
} else {
|
||||
compiled_model = core.compile_model(model, device, config);
|
||||
}
|
||||
compile_end_time = ggml_time_us();
|
||||
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
|
||||
infer_request_cache[key] = infer_request;
|
||||
decoder_cache[key] = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
for (const auto & ov_param : model->get_parameters()) {
|
||||
ov_input_names.push_back(ov_param->get_friendly_name());
|
||||
}
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
}
|
||||
|
||||
auto ov_input_names = ov_input_names_cache[key];
|
||||
auto ov_output_names = ov_output_names_cache[key];
|
||||
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
print_input_tensor_info(param_name, input_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
|
||||
infer_request->infer();
|
||||
infer_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_PROFILING")) {
|
||||
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
|
||||
GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
|
||||
if (!cache_hit) {
|
||||
GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
|
||||
}
|
||||
GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
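// Static-shape path for the NPU: two models are compiled per graph, one for prefill and one
// for single-token decode. The prompt is processed in fixed-size chunks
// (GGML_OPENVINO_PREFILL_CHUNK_SIZE, default 256) so that all tensor shapes stay static, and
// the decode model always sees exactly one token.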
enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
|
||||
auto & core = ov_singleton_core();
|
||||
|
||||
auto get_prefill_chunk_size = [] {
|
||||
const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
|
||||
if (chunk_size_str && atoi(chunk_size_str) > 0) {
|
||||
return atoi(chunk_size_str);
|
||||
}
|
||||
return 256;
|
||||
};
|
||||
|
||||
static std::string device = "NPU";
|
||||
static auto is_static = true;
|
||||
static auto stateful = false;
|
||||
static auto prefill_chunk_size = get_prefill_chunk_size();
|
||||
const auto & config = ggml_openvino_get_compile_config();
|
||||
|
||||
if (is_naive(cgraph)) {
|
||||
return naive_compute(cgraph, core, device, config);
|
||||
}
|
||||
|
||||
auto start_time = ggml_time_us();
|
||||
|
||||
static std::mutex cache_mutex;
|
||||
static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
|
||||
static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
|
||||
static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
|
||||
static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
|
||||
static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache;
|
||||
|
||||
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
|
||||
std::shared_ptr<ov::InferRequest> infer_request;
|
||||
ModelParams m_params;
|
||||
ComputeParams c_params;
|
||||
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
|
||||
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
const auto is_prefill = get_is_prefill(inp_pos);
|
||||
graph_key key(cgraph);
|
||||
bool cache_hit;
|
||||
|
||||
int64_t decoder_end_time;
|
||||
int64_t conversion_end_time;
|
||||
int64_t compile_end_time;
|
||||
int64_t infer_end_time;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(cache_mutex);
|
||||
|
||||
auto it = decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_statically(m_params);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
ggml_decoder->m_is_prefill = is_prefill;
|
||||
ggml_decoder->set_model_params(m_params);
|
||||
ggml_decoder->set_compute_params(c_params);
|
||||
if (old_m_params.kv_buffer_changed(m_params)) {
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key);
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
conversion_end_time = decoder_end_time;
|
||||
compile_end_time = decoder_end_time;
|
||||
} else {
|
||||
infer_request_cache.erase(key);
|
||||
infer_request_cache_prefill.erase(key);
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
||||
|
||||
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
|
||||
is_static, stateful, true, prefill_chunk_size);
|
||||
auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
|
||||
is_static, stateful, false, prefill_chunk_size);
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
||||
auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
|
||||
auto input_model_decode = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_decode);
|
||||
|
||||
auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill);
|
||||
ggml_decoder_prefill->clear_model_weights();
|
||||
auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode);
|
||||
ggml_decoder_decode->clear_model_weights();
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long) ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
|
||||
ov::serialize(model_prefill, timestamped_filename);
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp);
|
||||
ov::serialize(model_decode, timestamped_filename);
|
||||
}
|
||||
|
||||
ov::CompiledModel compiled_model_prefill;
|
||||
ov::CompiledModel compiled_model_decode;
|
||||
auto remote_context = ggml_openvino_get_remote_context();
|
||||
if (remote_context.has_value()) {
|
||||
compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config);
|
||||
compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config);
|
||||
} else {
|
||||
compiled_model_prefill = core.compile_model(model_prefill, device, config);
|
||||
compiled_model_decode = core.compile_model(model_decode, device, config);
|
||||
}
|
||||
|
||||
infer_request_cache_prefill[key] =
|
||||
std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
|
||||
infer_request_cache[key] = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
model = is_prefill ? model_prefill : model_decode;
|
||||
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
|
||||
infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
|
||||
decoder_cache[key] = ggml_decoder;
|
||||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
for (const auto & ov_param : model->get_parameters()) {
|
||||
ov_input_names.push_back(ov_param->get_friendly_name());
|
||||
}
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
ov_input_names_cache[key] = std::move(ov_input_names);
|
||||
ov_output_names_cache[key] = std::move(ov_output_names);
|
||||
}
|
||||
}
|
||||
|
||||
auto ov_input_names = ov_input_names_cache[key];
|
||||
auto ov_output_names = ov_output_names_cache[key];
|
||||
|
||||
if (is_prefill) {
|
||||
auto inp_len = inp_pos->ne[0];
|
||||
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
const auto input_tensor = infer_request->get_input_tensor(i);
|
||||
print_input_tensor_info(param_name, input_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(),
|
||||
infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
|
||||
infer_request->infer();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
infer_end_time = ggml_time_us();
|
||||
} else {
|
||||
for (size_t i = 0; i < ov_input_names.size(); i++) {
|
||||
auto param_name = ov_input_names[i];
|
||||
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
|
||||
infer_request->set_input_tensor(i, input_tensor);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
const auto input_tensor = infer_request->get_input_tensor(i);
|
||||
print_input_tensor_info(param_name, input_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
|
||||
ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(),
|
||||
infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data);
|
||||
infer_request->set_output_tensor(i, output_tensor);
|
||||
}
|
||||
|
||||
infer_request->infer();
|
||||
infer_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
const auto output_tensor = infer_request->get_output_tensor(i);
|
||||
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_PROFILING")) {
|
||||
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
|
||||
GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
|
||||
if (!cache_hit) {
|
||||
GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
|
||||
}
|
||||
GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
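// Graphs with fewer than 20 non-empty nodes (e.g. the small graphs produced by
// test-backend-ops) skip the LLM pipeline above and go through naive_compute, which converts
// and runs them directly.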
bool is_naive(ggml_cgraph * cgraph) {
|
||||
constexpr int naive_graph_size_threshold = 20;
|
||||
int count = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (cgraph->nodes[i]->op != GGML_OP_NONE) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count < naive_graph_size_threshold;
|
||||
}
|
||||
|
||||
enum ggml_status naive_compute(ggml_cgraph * cgraph,
|
||||
ov::Core & core,
|
||||
const std::string & device,
|
||||
const ov::AnyMap & config) {
|
||||
if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
bool naive = true;
|
||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
|
||||
auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
|
||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
|
||||
auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
ov::serialize(model, "IR_naive.xml");
|
||||
}
|
||||
|
||||
ov::InferRequest infer_request;
|
||||
auto remote_context = ggml_openvino_get_remote_context();
|
||||
if (remote_context.has_value()) {
|
||||
infer_request = core.compile_model(model, remote_context.value(), config).create_infer_request();
|
||||
} else {
|
||||
infer_request = core.compile_model(model, device, config).create_infer_request();
|
||||
}
|
||||
|
||||
auto ov_params = model->get_parameters();
|
||||
for (size_t i = 0; i < ov_params.size(); i++) {
|
||||
auto param_name = ov_params[i]->get_friendly_name();
|
||||
auto input_tensor = get_ov_input_tensor(decoder, param_name);
|
||||
infer_request.set_input_tensor(i, input_tensor);
|
||||
}
|
||||
|
||||
auto ov_results = model->get_results();
|
||||
for (size_t i = 0; i < ov_results.size(); i++) {
|
||||
auto result_name = ov_results[i]->get_friendly_name();
|
||||
auto output_tensor = get_ov_output_tensor(decoder, result_name);
|
||||
infer_request.set_output_tensor(i, output_tensor);
|
||||
}
|
||||
|
||||
infer_request.infer();
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
namespace {
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
|
||||
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
||||
|
||||
if (ggml_tensor->extra != nullptr) {
|
||||
// GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
|
||||
auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
|
||||
if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
|
||||
throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name);
|
||||
}
|
||||
auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
|
||||
return *tensor_extra->tensor;
|
||||
}
|
||||
|
||||
// GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
|
||||
auto * input_data = ggml_tensor->data;
|
||||
ov::Shape input_shape;
|
||||
if (ggml_tensor->op == GGML_OP_VIEW) {
|
||||
// This case is added to make test-backend-ops work
|
||||
input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
|
||||
} else {
|
||||
input_shape = ggml_decoder->get_shape(ggml_tensor);
|
||||
}
|
||||
auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
|
||||
return input_tensor;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
|
||||
ov::Tensor input_tensor;
|
||||
if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
|
||||
input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
|
||||
} else {
|
||||
input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
|
||||
}
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
||||
const std::string & param_name) {
|
||||
// NPU decoding stage
|
||||
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
|
||||
|
||||
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
|
||||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
|
||||
assert(ggml_tensor->ne[0] == 1);
|
||||
ov::Shape input_shape = {1, 1, 1, 1};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
if (ggml_tensor->type == GGML_TYPE_I32) {
|
||||
*input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
|
||||
} else if (ggml_tensor->type == GGML_TYPE_I64) {
|
||||
*input_tensor.data<int64_t>() = *((int64_t *) ggml_tensor->data);
|
||||
} else {
|
||||
throw std::runtime_error("Unexpected tensor type for " + param_name);
|
||||
}
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
|
||||
ov::Shape input_shape = {1, 1, 1, 1};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
|
||||
assert(ggml_tensor->ne[0] == 1);
|
||||
assert(inp_out_id == 0);
|
||||
*input_tensor.data<int32_t>() = inp_out_id;
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
|
||||
size_t context_size = ggml_decoder->get_ctx_size();
|
||||
std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
|
||||
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
|
||||
auto * data_ptr = input_tensor.data<float>();
|
||||
std::copy(padded_data.begin(), padded_data.begin() + context_size, data_ptr);
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
return get_ov_input_tensor(ggml_decoder, param_name);
|
||||
}
|
||||
|
||||
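// Prefill-stage input builder for the NPU: token, position and kv-index inputs are cut into
// prefill-chunk-sized slices (padding the tail with last_value + 1), the output-index input
// is remapped to chunk-local indices, and the attention-mask rows for the chunk are padded to
// the full context width with -inf before the diagonal is zeroed.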
ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
||||
const std::string & param_name,
|
||||
int chunk_index) {
|
||||
// NPU prompt processing stage
|
||||
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
|
||||
|
||||
const size_t input_len = ggml_decoder->get_input_len();
|
||||
const size_t chunk_size = ggml_decoder->m_prefill_chunk_size;
|
||||
const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
|
||||
const size_t chunk_pad_size = chunk_size - chunk_valid_size;
|
||||
|
||||
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
|
||||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
|
||||
ov::Shape input_shape = {1, 1, 1, chunk_size};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
// copy the chunk_index-th chunk from ggml_tensor
|
||||
size_t element_size = ggml_type_size(ggml_tensor->type);
|
||||
void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size;
|
||||
std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size);
|
||||
// pad the rest with last_value + 1, so that kv's of padded positions are inserted
|
||||
// to the next row after the valids row in the kvcache
|
||||
if (chunk_pad_size > 0) {
|
||||
if (ggml_tensor->type == GGML_TYPE_I32) {
|
||||
int32_t last_value =
|
||||
*((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
|
||||
int32_t * output_data = input_tensor.data<int32_t>();
|
||||
std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
|
||||
} else if (ggml_tensor->type == GGML_TYPE_I64) {
|
||||
int64_t last_value =
|
||||
*((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
|
||||
int64_t * output_data = input_tensor.data<int64_t>();
|
||||
std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
|
||||
} else {
|
||||
throw std::runtime_error("Unexpected tensor type for " + param_name);
|
||||
}
|
||||
}
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
|
||||
size_t output_len = ggml_decoder->get_compute_params().output_len;
|
||||
ov::Shape input_shape = {1, 1, 1, output_len};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
if (ggml_tensor->ne[0] == 0) {
|
||||
*input_tensor.data<int32_t>() = 0;
|
||||
} else {
|
||||
auto * data_addr = input_tensor.data<int32_t>();
|
||||
for (size_t i = 0; i < output_len; i++) {
|
||||
data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
|
||||
}
|
||||
}
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
|
||||
size_t cols = ggml_tensor->ne[0];
|
||||
size_t rows = ggml_tensor->ne[1];
|
||||
float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
|
||||
size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size);
|
||||
size_t context_size = ggml_decoder->get_ctx_size();
|
||||
std::vector<float> padded_data =
|
||||
pad_input<float>(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY);
|
||||
set_zero_diagonal(padded_data, chunk_size, context_size);
|
||||
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size});
|
||||
auto * data_ptr = input_tensor.data<float>();
|
||||
std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr);
|
||||
return input_tensor;
|
||||
}
|
||||
|
||||
return get_ov_input_tensor(ggml_decoder, param_name);
|
||||
}
|
||||
|
||||
ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {
|
||||
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(result_name);
|
||||
auto output_type = ggml_decoder->get_ov_type(ggml_tensor);
|
||||
auto output_shape = ggml_decoder->get_shape(ggml_tensor);
|
||||
|
||||
ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
|
||||
return output_tensor;
|
||||
}
|
||||
|
||||
size_t checksum(const void * data, size_t size) {
|
||||
const uint8_t * bytes = static_cast<const uint8_t *>(data);
|
||||
size_t sum = 0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
sum += (uint8_t) i;
|
||||
sum += bytes[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
|
||||
std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
|
||||
<< std::endl;
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32: {
|
||||
if (name.find("self_kq_mask") == std::string::npos) {
|
||||
std::cout << *(tensor.data<float>()) << std::endl;
|
||||
} else {
|
||||
size_t rows = tensor.get_shape()[2];
|
||||
size_t cols = tensor.get_shape()[3];
|
||||
auto * data = tensor.data<float>();
|
||||
for (size_t i = 0; i < rows; ++i) {
|
||||
for (size_t j = 0; j < cols; ++j) {
|
||||
float val = data[i * cols + j];
|
||||
if (std::isinf(val) && val < 0) {
|
||||
std::cout << std::setw(5) << "-inf";
|
||||
} else {
|
||||
std::cout << std::setw(5) << val;
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case ov::element::f16:
|
||||
std::cout << *(tensor.data<ov::float16>()) << std::endl;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
for (size_t i = 0; i < tensor.get_size(); ++i) {
|
||||
std::cout << tensor.data<int32_t>()[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
break;
|
||||
case ov::element::i64:
|
||||
for (size_t i = 0; i < tensor.get_size(); ++i) {
|
||||
std::cout << tensor.data<int64_t>()[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst) {
|
||||
std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst
|
||||
<< std::endl;
|
||||
|
||||
auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
float first = get_value(0);
|
||||
float min = first;
|
||||
float max = first;
|
||||
double sum = first;
|
||||
|
||||
for (size_t i = 1; i < size; ++i) {
|
||||
float v = get_value(i);
|
||||
if (v < min) {
|
||||
min = v;
|
||||
}
|
||||
if (v > max) {
|
||||
max = v;
|
||||
}
|
||||
sum += v;
|
||||
}
|
||||
double mean = sum / size;
|
||||
|
||||
std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First" << std::setw(12)
|
||||
<< "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl;
|
||||
std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min
|
||||
<< std::setw(12) << max << std::setw(12) << mean << std::endl;
|
||||
};
|
||||
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32: {
|
||||
const float * data = tensor.data<float>();
|
||||
size_t size = tensor.get_size();
|
||||
print_float_stats("[f32]", size, [data](size_t i) { return data[i]; });
|
||||
break;
|
||||
}
|
||||
case ov::element::f16: {
|
||||
const ov::float16 * data = tensor.data<ov::float16>();
|
||||
size_t size = tensor.get_size();
|
||||
print_float_stats("[f16]", size, [data](size_t i) { return static_cast<float>(data[i]); });
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) {
|
||||
for (size_t i = 0; i < rows; ++i) {
|
||||
size_t diag_col = std::min(i, cols - 1);
|
||||
matrix[i * cols + diag_col] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
auto * op = cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; ++j) {
|
||||
auto * src = op->src[j];
|
||||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
if (GgmlOvDecoder::is_inp_pos(src, op)) {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_LOG_ERROR("get_inp_pos_tensor: inp_pos not found in cgraph");
|
||||
throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph");
|
||||
}
|
||||
|
||||
bool get_is_prefill(const ggml_tensor * inp_pos) {
    return inp_pos->ne[0] > 1;
}

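// Illustrative sketch only (not part of this patch): the two helpers above are intended to be
// combined at the start of graph execution, along the lines of:
//
//     const ggml_tensor * inp_pos = get_inp_pos_tensor(cgraph);
//     bool is_prefill = get_is_prefill(inp_pos);  // more than one token position => prefill
//     // ...then choose the prefill or single-token decode path based on is_prefill
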
#pragma GCC diagnostic pop
@ -0,0 +1,96 @@
#pragma once

#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
#include "ggml-impl.h"

#include <algorithm>
#include <cstddef>
#include <memory>
#include <openvino/runtime/core.hpp>
#include <string>
#include <vector>

struct graph_key {
    int n_nodes;
    std::string first_node_name;
    std::string last_node_name;

    graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
        if (n_nodes > 0) {
            first_node_name = cgraph->nodes[0]->name;
            last_node_name = cgraph->nodes[n_nodes - 1]->name;
        }
    }

    bool operator==(const graph_key & other) const {
        return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
               last_node_name == other.last_node_name;
    }
};

struct graph_key_hash {
    size_t operator()(const graph_key & key) const {
        size_t h = std::hash<int>{}(key.n_nodes);
        if (key.n_nodes > 0) {
            h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
            h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
        }
        return h;
    }
};

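// Illustrative sketch (the cache name below is hypothetical and not part of this header):
// graph_key and graph_key_hash are shaped for use as an unordered_map key, e.g. so the
// compute entry point below can reuse a compiled OpenVINO model for a previously seen graph:
//
//     static std::unordered_map<graph_key, ov::CompiledModel, graph_key_hash> compiled_cache;
//     graph_key key(cgraph);
//     if (compiled_cache.find(key) == compiled_cache.end()) {
//         // compile the model for this graph and store it under `key`
//     }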
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph);

enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device, bool stateful = false);
enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph);

size_t checksum(const void * data, size_t size);

void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);

void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst);

template <typename T>
std::vector<T> pad_input(const T * data,
                         size_t rows,
                         size_t cols,
                         size_t padded_rows,
                         size_t padded_cols,
                         T pad_value) {
    std::vector<T> padded(padded_rows * padded_cols, pad_value);

    for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
        for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
            padded[i * padded_cols + j] = data[i * cols + j];
        }
    }

    return padded;
}

template <typename T>
std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
    return pad_input<T>(reinterpret_cast<const T *>(tensor->data),
                        static_cast<size_t>(tensor->ne[1]),  // rows
                        static_cast<size_t>(tensor->ne[0]),  // cols
                        padded_rows, padded_cols, pad_value);
}

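// Illustrative example (made-up tensor name and sizes): pad a ggml attention-mask tensor to a
// fixed static shape with -INFINITY, then clear its diagonal using set_zero_diagonal() below:
//
//     std::vector<float> mask = pad_input<float>(mask_tensor, padded_rows, padded_cols, -INFINITY);
//     set_zero_diagonal(mask, padded_rows, padded_cols);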
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols);

const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);

bool get_is_prefill(const ggml_tensor * inp_pos);

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                             const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                              const std::string & param_name,
                                              int chunk_index);

ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name);

bool is_naive(struct ggml_cgraph * cgraph);

enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
                               ov::Core & core,
                               const std::string & device,
                               const ov::AnyMap & config);
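// Assumed call flow, inferred from the declarations above rather than from the implementation:
// ov_graph_compute() is the entry point; graphs that is_naive() flags fall back to
// naive_compute() for op-by-op execution, while the rest are lowered through GgmlOvDecoder and
// run via the static (prefill/decode) or dynamic paths, with inputs and outputs bound through
// get_ov_input_tensor*() and get_ov_output_tensor().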