Added OpenVINO CI/CD. Updated docs

This commit is contained in:
ravi9 2025-07-17 17:51:10 -07:00 committed by Mustafa Cavus
parent d61f83c9b7
commit ea75772e48
6 changed files with 314 additions and 39 deletions

.devops/openvino.Dockerfile (new file, 134 lines)

@ -0,0 +1,134 @@
ARG OPENVINO_VERSION_MAJOR=2025.2
ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
ARG UBUNTU_VERSION=24.04
# Optional proxy build arguments - empty by default
ARG http_proxy=
ARG https_proxy=
## Build Image
FROM ubuntu:${UBUNTU_VERSION} AS build
# Pass proxy args to build stage
ARG http_proxy
ARG https_proxy
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
gnupg \
wget \
git \
cmake \
ninja-build \
build-essential \
libtbb12 \
libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*
# Install OpenVINO for Ubuntu 24.04
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
RUN mkdir -p /opt/intel && \
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
cd - && \
ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
ENV OpenVINO_DIR=/opt/intel/openvino
WORKDIR /app
COPY . .
# Build Stage
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON && \
cmake --build build/ReleaseOV -j$(nproc)"
# Copy all necessary libraries
RUN mkdir -p /app/lib && \
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
&& cp build/ReleaseOV/bin/* /app/full/ \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base Runtime Image
FROM ubuntu:${UBUNTU_VERSION} AS base
# Pass proxy args to runtime stage
ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app/
### Full (all binaries)
FROM base AS full
ARG http_proxy
ARG https_proxy
COPY --from=build /app/full /app/
WORKDIR /app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
python3 \
python3-venv \
python3-pip && \
python3 -m venv /ov-venv && \
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app/
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app/
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
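For local testing outside CI, the stages above can be exercised with plain `docker build` / `docker run`. A minimal sketch, assuming the file lives at `.devops/openvino.Dockerfile` in the repo root, a GGUF model is available under `~/models`, and an Intel GPU is exposed via `/dev/dri`; the image tags are illustrative, not part of this commit:

```bash
# Build the three targets defined in the Dockerfile (proxy build args are optional and default to empty)
docker build -f .devops/openvino.Dockerfile --target full   -t llama-cpp-openvino:full .
docker build -f .devops/openvino.Dockerfile --target light  -t llama-cpp-openvino:light .
docker build -f .devops/openvino.Dockerfile --target server -t llama-cpp-openvino:server \
    --build-arg http_proxy="$http_proxy" --build-arg https_proxy="$https_proxy" .

# Run the server image; port 8080 matches the HEALTHCHECK above.
# --device /dev/dri exposes an Intel GPU to the container (omit it for CPU-only inference).
docker run --rm -p 8080:8080 --device /dev/dri \
    -v "$HOME/models:/models" \
    llama-cpp-openvino:server -m /models/Llama-3.2-1B-Instruct.fp16.gguf
```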

View File

@ -737,6 +737,45 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
ubuntu-24-cmake-openvino:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-openvino-no-preset-v1
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
export OPENVINO_VERSION_MAJOR=2025.2
export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
sudo mkdir -p /opt/intel
wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
- name: Build
id: cmake_build
run: |
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml

View File

@ -47,6 +47,7 @@ jobs:
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
steps:
- name: Check out the repo
uses: actions/checkout@v4

View File

@ -231,6 +231,63 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
ubuntu-24-openvino:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-openvino-release-no-preset-v1
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
export OPENVINO_VERSION_MAJOR=2025.2
export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
sudo mkdir -p /opt/intel
wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
- name: Build
id: cmake_build
run: |
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/ReleaseOV/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip
name: llama-bin-ubuntu-openvino-x64.zip
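For reference, a hedged sketch of how the uploaded archive could be consumed on another Ubuntu 24.04 machine with OpenVINO already installed; the tag below is a placeholder, and the `build/ReleaseOV/bin/` layout follows from the `zip -r` invocation above:

```bash
# Unpack the release artifact (the archive preserves the build/ReleaseOV/bin/ paths);
# substitute the real release tag for <tag>
unzip llama-<tag>-bin-ubuntu-openvino-x64.zip -d llama-openvino
# The binaries link against OpenVINO, so source the runtime environment first
source /opt/intel/openvino/setupvars.sh
./llama-openvino/build/ReleaseOV/bin/llama-cli --version
```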
windows-cpu:
runs-on: windows-2025

View File

@ -25,6 +25,9 @@
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with OPENVINO support
# GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -165,6 +168,15 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
-DBUILD_SHARED_LIBS=OFF"
fi
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
if [ -z ${OpenVINO_DIR} ]; then
echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
echo "source /opt/intel/openvino/setupvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
fi
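Concretely, this check means the OpenVINO environment must be sourced before invoking the CI script; a minimal sketch, assuming the archive install location used elsewhere in this commit:

```bash
# setupvars.sh exports OpenVINO_DIR, which the check above looks for
source /opt/intel/openvino/setupvars.sh
mkdir -p ./tmp/results ./tmp/mnt
GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```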
## helpers
# download a file if it does not exist or if it is outdated

View File

@ -25,7 +25,7 @@ The following sections describe how to build with different backends and options
* [Arm® KleidiAI™](#arm-kleidiai)
* [OpenCL](#opencl)
* [Android](#android-1)
* [OPENVINO](#openvino)
* [OpenVINO](#openvino)
* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
## CPU Build
@ -696,20 +696,48 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## OPENVINO
## OpenVINO
[OpenVINO](https://docs.openvino.ai/2025/index.html) is a open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
### Prerequisites
- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
- Git, CMake, and Ninja are required for building. Install the build dependencies:
```bash
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```
### 1. Install OpenVINO Runtime
- Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)**
- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment):
<details>
<summary>📦 Click to expand OpenVINO 2025.2 installation commands</summary>
<br>
```bash
source /opt/intel/openvino_2025.1.0/setupvars.sh
export OPENVINO_VERSION_MAJOR=2025.2
export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
sudo mkdir -p /opt/intel
wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
source /opt/intel/openvino/setupvars.sh
```
</details>
- Verify that OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
@ -725,23 +753,26 @@ cd llama.cpp
git switch dev_backend_openvino
# Build with OpenVINO support
cmake --preset ReleaseOV
cmake --build build/ReleaseOV --parallel
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
```
### 3. Download Sample Model
Download the Phi-3 mini model for testing:
Download models for testing:
```bash
# Create models directory
mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf
mkdir -p ~/models/
# Download model file
# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
-O ~/models/Llama-3.2-1B-Instruct.fp16.gguf
# Download model file: Phi-3-mini-4k-instruct-fp16.gguf
wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
-O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
-O ~/models/Phi-3-mini-4k-instruct-fp16.gguf
```
### 4. Run inference with OpenVINO backend:
@ -750,28 +781,19 @@ When using the OpenVINO backend, the first inference token may have slightly hig
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# Default device is GPU.
# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU.
export GGML_OPENVINO_DEVICE=GPU
./build/ReleaseOV/bin/llama-simple \
-m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
-n 50 \
"Hello, my name is "
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
```
### Using Llama.cpp's Built-in CPU Backend (for Comparison)
To compare performance with the default CPU backend:
To run in chat mode:
```bash
# Build CPU-only version
cmake --preset ReleaseCPU
cmake --build build/ReleaseCPU --parallel
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# Run with Default CPU backend
./build/ReleaseCPU/bin/llama-simple \
-m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
-n 50 \
"Hello, my name is "
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
```
@ -779,13 +801,14 @@ cmake --build build/ReleaseCPU --parallel
Control OpenVINO behavior using these environment variables:
- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO.
- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance.
- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet.
- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps
- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging
- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging
- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging.
- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging.
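For example, to target an Intel NPU, a minimal sketch based on the variables above (per the cache note, `GGML_OPENVINO_CACHE_DIR` is left unset for NPU):

```bash
# Select the NPU explicitly; this also enables static compilation mode
export GGML_OPENVINO_DEVICE=NPU
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
```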
### Example with Profiling
@ -793,11 +816,20 @@ Control OpenVINO behavior using these environment variables:
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
./build/ReleaseOV/bin/llama-simple \
-m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
-n 50 \
"Hello, my name is "
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
```
### Using Llama.cpp's Built-in CPU Backend (for Comparison)
To compare performance with the default CPU backend:
```bash
# Build CPU-only version
cmake --preset ReleaseCPU
cmake --build build/ReleaseCPU --parallel
# Run with the default CPU backend
./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
```
## Notes about GPU-accelerated backends