Added OpenVINO CI/CD. Updated docs

2025-07-17 17:51:10 -07:00 · 2025-07-17 17:51:10 -07:00 · ea75772e48
parent d61f83c9b7
commit ea75772e48
6 changed files with 314 additions and 39 deletions
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@ -0,0 +1,134 @@
 # Global build arguments. They are declared before the first FROM, so each
 # stage that needs one re-declares it with a bare ARG.
 # OPENVINO_VERSION_MAJOR / OPENVINO_VERSION_FULL select the OpenVINO archive
 # downloaded by the build stage; UBUNTU_VERSION selects the ubuntu base image tag.
 ARG OPENVINO_VERSION_MAJOR=2025.2
 ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
 ARG UBUNTU_VERSION=24.04
 # Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=
 ## Build Image
 FROM ubuntu:${UBUNTU_VERSION} AS build
 # Pass proxy args to build stage
 ARG http_proxy
 ARG https_proxy
 # Install the compiler toolchain (build-essential, cmake, ninja-build), the
 # tools used to fetch sources and archives (git, wget, ca-certificates, gnupg)
 # and the TBB / libcurl packages. The apt package lists are removed in the
 # same layer.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        gnupg \
        wget \
        git \
        cmake \
        ninja-build \
        build-essential \
        libtbb12 \
        libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*
 # Install OpenVINO for Ubuntu 24.04
 # Downloads the archive selected by OPENVINO_VERSION_MAJOR / OPENVINO_VERSION_FULL,
 # unpacks it, moves it to /opt/intel/openvino_<major>, runs the bundled
 # dependency installer (answering "Y" to its prompt) and exposes the install
 # through the version-independent symlink /opt/intel/openvino.
 ARG OPENVINO_VERSION_MAJOR
 ARG OPENVINO_VERSION_FULL
 RUN mkdir -p /opt/intel && \
    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    # The archive is no longer needed once extracted; drop it from the layer.
    rm openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
    cd - && \
    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
 # OpenVINO_DIR points at the /opt/intel/openvino symlink; it is used below to
 # locate setupvars.sh and the OpenVINO shared libraries.
 ENV OpenVINO_DIR=/opt/intel/openvino
 WORKDIR /app
 # Copy the build context into /app.
 COPY . .
 # Build Stage
 # The commands run under bash because setupvars.sh is loaded with `source`.
 # Configures a Release build with the OpenVINO backend enabled
 # (-DGGML_OPENVINO=ON) in build/ReleaseOV and compiles it with one job per CPU.
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_OPENVINO=ON && \
    cmake --build build/ReleaseOV -j$(nproc)"
 # Copy all necessary libraries
 # Gathers every shared object produced by the build, plus the OpenVINO
 # libraries, into /app/lib. The OpenVINO libraries are taken from
 # runtime/lib/intel64 when that directory exists and from lib/intel64
 # otherwise; cp -P keeps their symlinks as symlinks.
 # An explicit directory test is used instead of `... 2>/dev/null || find ...`
 # so that a failing mkdir or first find is not masked by the fallback.
 RUN mkdir -p /app/lib && \
    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
    if [ -d "${OpenVINO_DIR}/runtime/lib/intel64" ]; then \
        find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \; ; \
    else \
        find "${OpenVINO_DIR}/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \; ; \
    fi
 # Create runtime directories and copy binaries
 # /app/full collects everything the runtime images copy out of this stage:
 # the built binaries, the top-level Python scripts, gguf-py, the requirements
 # files and .devops/tools.sh (installed as tools.sh).
 RUN mkdir -p /app/full \
    && cp build/ReleaseOV/bin/* /app/full/ \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base Runtime Image
 # Common parent of the full, light and server images.
 FROM ubuntu:${UBUNTU_VERSION} AS base
 # Pass proxy args to runtime stage
 ARG http_proxy
 ARG https_proxy
 # Install the runtime packages (curl is used by the server HEALTHCHECK) and
 # purge the apt caches and temp files in the same layer. apt-get is used
 # throughout, matching the full stage.
 RUN apt-get update \
    && apt-get install -y libgomp1 libtbb12 curl \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 # Shared libraries collected by the build stage are placed directly in /app,
 # the directory the binaries are later copied into.
 COPY --from=build /app/lib/ /app/
 ### Full (all binaries)
 FROM base AS full
 ARG http_proxy
 ARG https_proxy
 # Bring in every binary, the Python scripts, gguf-py, the requirements files
 # and tools.sh that the build stage placed in /app/full.
 COPY --from=build /app/full /app/
 WORKDIR /app
 # Install git and Python, create a dedicated virtual environment at /ov-venv,
 # install requirements.txt into it, then purge apt caches and temp files in
 # the same layer.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    git \
    python3 \
    python3-venv \
    python3-pip && \
    python3 -m venv /ov-venv && \
    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete
 # Activate the virtual environment, then replace the shell with tools.sh.
 # The trailing "--" becomes $0 of the bash -c script, so "$@" carries only
 # the arguments passed to the container.
 ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
 ### Light, CLI only
 # Adds only the llama-cli binary on top of the base image and runs it as the
 # container entrypoint.
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app/
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 # Adds only the llama-server binary on top of the base image.
 FROM base AS server
 # LLAMA_ARG_HOST=0.0.0.0 — presumably makes llama-server listen on all
 # interfaces so the published port is reachable; confirm against the server docs.
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app/
 WORKDIR /app
 # Health probe: curl -f fails on an HTTP error status from /health on port 8080.
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -737,6 +737,45 @@ jobs:
            -DGGML_SYCL_F16=ON
          cmake --build build --config Release -j $(nproc)
  # CI job: configure and build llama.cpp with the OpenVINO backend on Ubuntu 24.04.
  ubuntu-24-cmake-openvino:
      runs-on: ubuntu-24.04
      steps:
        - name: Clone
          id: checkout
          uses: actions/checkout@v4
        # Compiler cache keyed specifically for this job.
        - name: ccache
          uses: hendrikmuhs/ccache-action@v1.2.16
          with:
            key: ubuntu-24-cmake-openvino-no-preset-v1
            evict-old-files: 1d
        # Installs the build packages, then downloads the OpenVINO archive,
        # unpacks it to /opt/intel/openvino_<major>, runs its dependency
        # installer and creates the /opt/intel/openvino symlink used by the
        # Build step.
        - name: Dependencies
          id: depends
          run: |
            export OPENVINO_VERSION_MAJOR=2025.2
            export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
            sudo apt-get update
            sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
            sudo mkdir -p /opt/intel
            wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
            tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
            sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
            rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
            cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
            sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
        # Sources the OpenVINO environment, then configures a Release build
        # with -DGGML_OPENVINO=ON in build/ReleaseOV and compiles it.
        - name: Build
          id: cmake_build
          run: |
            source /opt/intel/openvino/setupvars.sh
            cmake -B build/ReleaseOV -G Ninja \
              -DCMAKE_BUILD_TYPE=Release \
              -DGGML_OPENVINO=ON
            cmake --build build/ReleaseOV --config Release -j $(nproc)
  build-linux-cross:
    uses: ./.github/workflows/build-linux-cross.yml
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -47,6 +47,7 @@ jobs:
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
          - { tag: "rocm",   dockerfile: ".devops/rocm.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
          # runs_on is set like the sibling rows; presumably the job reads it to pick its runner (confirm against the job's runs-on expression).
          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -231,6 +231,63 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
          name: llama-bin-ubuntu-vulkan-x64.tar.gz
  # Release job: build llama.cpp with the OpenVINO backend on Ubuntu 24.04 and
  # upload the resulting binaries as a zip artifact.
  ubuntu-24-openvino:
    runs-on: ubuntu-24.04
    steps:
      # fetch-depth: 0 fetches the full history (presumably needed by the
      # tag-name step below — confirm).
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Compiler cache keyed specifically for this release job.
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-24-cmake-openvino-release-no-preset-v1 
          evict-old-files: 1d
      # Installs the build packages, then downloads the OpenVINO archive,
      # unpacks it to /opt/intel/openvino_<major>, runs its dependency
      # installer and creates the /opt/intel/openvino symlink used by the
      # Build step.
      - name: Dependencies
        id: depends
        run: |
          export OPENVINO_VERSION_MAJOR=2025.2
          export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
          sudo apt-get update
          sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
          sudo mkdir -p /opt/intel
          wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
          tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
          sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
          rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
          cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
          sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
      # Sources the OpenVINO environment, then configures a Release build
      # with -DGGML_OPENVINO=ON in build/ReleaseOV and compiles it.
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/openvino/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
          cmake --build build/ReleaseOV --config Release -j $(nproc)
      # The tag name produced here is embedded in the artifact file name.
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      # Adds LICENSE next to the binaries and zips the bin directory.
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/ReleaseOV/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip
          name: llama-bin-ubuntu-openvino-x64.zip
  windows-cpu:
    runs-on: windows-2025
--- a/ci/run.sh
+++ b/ci/run.sh
@ -25,6 +25,9 @@
 # # with KLEIDIAI support
 # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 # # with OPENVINO support
 # GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -165,6 +168,15 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
        -DBUILD_SHARED_LIBS=OFF"
 fi
 # Enable the OpenVINO backend when GG_BUILD_OPENVINO is set to a non-empty
 # value. OpenVINO_DIR must already be set in the environment (the message
 # below shows how); otherwise the script prints instructions and exits 1.
 # Both expansions are quoted so that values containing whitespace do not
 # break the `[` tests.
 if [ -n "${GG_BUILD_OPENVINO}" ]; then
    if [ -z "${OpenVINO_DIR}" ]; then
        echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
        echo "source /opt/intel/openvino/setupvars.sh"
        exit 1
    fi
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
 fi
 ## helpers
 # download a file if it does not exist or if it is outdated
--- a/docs/build.md
+++ b/docs/build.md
@ -25,7 +25,7 @@ The following sections describe how to build with different backends and options
 * [Arm® KleidiAI™](#arm-kleidiai)
 * [OpenCL](#opencl)
 * [Android](#android-1)
-* [OPENVINO](#openvino)
+* [OpenVINO](#openvino)
 * [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
 ## CPU Build
@ -696,20 +696,48 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
 To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
-## OPENVINO
+## OpenVINO
-[OpenVINO](https://docs.openvino.ai/2025/index.html) is a open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
+[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. 
 The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
 Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
 ### Prerequisites
 - Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
 - **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
 - Git, CMake, and Ninja are required for building. On Debian/Ubuntu, install the build dependencies with:
 ```bash
  sudo apt-get update
  sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
 ```
 ### 1. Install OpenVINO Runtime
 - Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)**
- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment):
+<details>
 <summary>📦 Click to expand OpenVINO 2025.2 installation commands</summary>
 <br>
 ```bash
-source /opt/intel/openvino_2025.1.0/setupvars.sh
+export OPENVINO_VERSION_MAJOR=2025.2
 export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
 sudo apt-get update
 sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
 sudo mkdir -p /opt/intel
 wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
 tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
 sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
 rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
 cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
 echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
 sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
 source /opt/intel/openvino/setupvars.sh
 ```
 </details>
 - Verify OpenVINO is initialized properly
 ```bash
 echo $OpenVINO_DIR
@ -725,23 +753,26 @@ cd llama.cpp
 git switch dev_backend_openvino
 # Build with OpenVINO support
-cmake --preset ReleaseOV
+source /opt/intel/openvino/setupvars.sh
-cmake --build build/ReleaseOV --parallel
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
-
+cmake --build build/ReleaseOV --config Release -j $(nproc)
 ```
 ### 3. Download Sample Model
-Download the Phi-3 mini model for testing:
+Download models for testing:
 ```bash
 # Create models directory
-mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf
+mkdir -p ~/models/
-# Download model file
+# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
 wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
     -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf
 # Download model file: Phi-3-mini-4k-instruct-fp16.gguf
 wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
-     -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
+     -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf
 ```
 ### 4. Run inference with OpenVINO backend:
@ -750,28 +781,19 @@ When using the OpenVINO backend, the first inference token may have slightly hig
 ```bash
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 # Default device is GPU.
 # If not set, automatically selects the first available device in priority order: GPU, CPU, NPU.
 export GGML_OPENVINO_DEVICE=GPU
-./build/ReleaseOV/bin/llama-simple \
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
    -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
    -n 50 \
    "Hello, my name is "
 ```
-### Using Llama.cpp's Built-in CPU Backend (for Comparison)
+To run in chat mode:
 To compare performance with the deafult CPU backend:
 ```bash
-# Build CPU-only version
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 cmake --preset ReleaseCPU
 cmake --build build/ReleaseCPU --parallel
-# Run with Default CPU backend
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
 ./build/ReleaseCPU/bin/llama-simple \
    -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
    -n 50 \
    "Hello, my name is "
 ```
@ -779,13 +801,14 @@ cmake --build build/ReleaseCPU --parallel
 Control OpenVINO behavior using these environment variables:
-   **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO.
+-   **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference.  If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. 
 -   **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: model caching is not yet supported on NPU devices.
 -   **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
-   **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling
+-   **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
-   **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`
+-   **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
-   **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps
+-   **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
-   **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging
+-   **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging.
-   **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging
+-   **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging.
 ### Example with Profiling
@ -793,11 +816,20 @@ Control OpenVINO behavior using these environment variables:
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 export GGML_OPENVINO_PROFILING=1
-./build/ReleaseOV/bin/llama-simple \
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-    -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
+```
    -n 50 \
    "Hello, my name is "
 ### Using Llama.cpp's Built-in CPU Backend (for Comparison)
 To compare performance with the default CPU backend:
 ```bash
 # Build CPU-only version
 cmake --preset ReleaseCPU
 cmake --build build/ReleaseCPU --parallel
 # Run with the default CPU backend
 ./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
 ```
 ## Notes about GPU-accelerated backends