# Embedding CLI build and tests
#
# Builds the llama-embedding tool and runs the e2e embedding test suite.
# Linux is the required job; Windows and macOS are best-effort
# (continue-on-error) smoke checks.
name: Embedding CLI

on:
  workflow_dispatch:
  push:
    branches: [master, feature/**]
    paths:
      - '.github/workflows/embedding.yml'
      - 'examples/**'
      - 'src/**'
      - 'ggml/**'
      - 'include/**'
      - '**/CMakeLists.txt'
      - 'tests/e2e/embedding/**'
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - '.github/workflows/embedding.yml'
      - 'examples/**'
      - 'src/**'
      - 'ggml/**'
      - 'include/**'
      - '**/CMakeLists.txt'
      - 'tests/e2e/embedding/**'

jobs:
  embedding-cli-tests-linux:
    runs-on: ubuntu-latest
    env:
      LLAMA_CACHE: tmp  # stable path for cache
      EMBD_TEST_DEBUG: "1"

    steps:
      - uses: actions/checkout@v4
        with: { fetch-depth: 0 }

      # Restore-only half of the cache pair; the explicit save step at the
      # end of the job persists the model even when the test step fails.
      # (Using the full actions/cache here as well would produce a duplicate
      # post-job save with the same key.)
      - name: Restore model cache
        uses: actions/cache/restore@v4
        with:
          path: |
            ~/.cache/llama.cpp
            tmp
          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
          restore-keys: |
            hf-${{ runner.os }}-
            hf-

      - name: Install system deps
        run: |
          sudo apt-get update
          sudo apt-get -y install \
            build-essential cmake curl libcurl4-openssl-dev python3-pip

      - name: Set up Python
        uses: actions/setup-python@v5
        with: { python-version: '3.11' }

      - name: Install Python deps
        run: |
          python -m pip install -r requirements.txt || echo "No extra requirements found"
          python -m pip install pytest numpy pytest-timeout

      - name: Build llama-embedding
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release
          cmake --build build --target llama-embedding -j $(nproc)

      # First invocation downloads the model from Hugging Face into
      # LLAMA_CACHE; retried up to 3 times to tolerate transient network
      # failures. The tiny ctx/threads settings keep the warm-up run cheap.
      - name: Pre-download tiny model (retry x3 on network)
        run: |
          set -e
          tries=0
          until ./build/bin/llama-embedding \
              -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
              -hff embeddinggemma-300M-qat-Q4_0.gguf \
              --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
            tries=$((tries+1))
            if [ $tries -ge 3 ]; then
              echo "Pre-download failed after $tries attempts"
              exit 1
            fi
            echo "Retrying download ($tries/3)..."
            sleep 3
          done

      - name: Run embedding tests (30s per-test cap)
        shell: bash
        run: |
          set -o pipefail
          pytest -v tests/e2e/embedding \
            --timeout=30 \
            --durations=10 \
            --junitxml=pytest-report.xml | tee pytest-output.txt

      - name: Upload test artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: linux-embedding-tests
          path: |
            pytest-output.txt
            pytest-report.xml

      # Save-only half of the cache pair; runs even when earlier steps fail
      # so a successful model download is never lost.
      - name: Save model cache
        if: always()
        uses: actions/cache/save@v4
        with:
          path: |
            ~/.cache/llama.cpp
            tmp
          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1

  embedding-cli-tests-windows:
    runs-on: windows-latest
    continue-on-error: true
    env:
      LLAMA_CACHE: tmp
      EMBD_TEST_DEBUG: "1"

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: '3.11' }

      # --- vcpkg plain bootstrap (no actions, no submodules) ---
      - name: Bootstrap vcpkg
        shell: pwsh
        run: |
          $env:VCPKG_ROOT = "$env:RUNNER_TEMP\vcpkg"
          git clone https://github.com/microsoft/vcpkg $env:VCPKG_ROOT
          & "$env:VCPKG_ROOT\bootstrap-vcpkg.bat" -disableMetrics
          echo "VCPKG_ROOT=$env:VCPKG_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append

      - name: Install curl with OpenSSL via vcpkg
        shell: pwsh
        run: |
          & "$env:VCPKG_ROOT\vcpkg.exe" install curl[openssl]:x64-windows

      # NOTE: actions/cache does not expand shell variables such as $HOME;
      # use the supported "~" shorthand (consistent with the other jobs).
      - name: Restore model cache
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/llama.cpp
            tmp
          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
          restore-keys: |
            hf-${{ runner.os }}-
            hf-

      - name: Install Python deps
        run: pip install pytest numpy

      - name: Configure & Build (Release)
        shell: pwsh
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release `
            -DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_ROOT\scripts\buildsystems\vcpkg.cmake"
          cmake --build build --target llama-embedding --config Release -j 2

      - name: Pre-download tiny model (retry x3)
        shell: bash
        run: |
          set -e
          tries=0
          until ./build/bin/Release/llama-embedding.exe \
              -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
              -hff embeddinggemma-300M-qat-Q4_0.gguf \
              --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
            tries=$((tries+1))
            if [ $tries -ge 3 ]; then
              echo "Pre-download failed after $tries attempts"; exit 1
            fi
            echo "Retrying download ($tries/3)..."; sleep 3
          done

      - name: Run smoke tests
        shell: bash
        run: |
          pytest -q tests/e2e/embedding -k raw_vs_json_consistency

  embedding-cli-tests-macos:
    runs-on: macos-latest
    continue-on-error: true
    env:
      LLAMA_CACHE: tmp
      EMBD_TEST_DEBUG: "1"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: '3.11' }

      - name: Install Python deps
        run: pip install pytest numpy

      - name: Build
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release
          cmake --build build --target llama-embedding -j 3

      - name: Pre-download tiny model (retry x3)
        run: |
          set -e
          tries=0
          until ./build/bin/llama-embedding \
              -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
              -hff embeddinggemma-300M-qat-Q4_0.gguf \
              --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
            tries=$((tries+1))
            if [ $tries -ge 3 ]; then
              echo "Pre-download failed after $tries attempts"; exit 1
            fi
            echo "Retrying download ($tries/3)..."; sleep 3
          done

      - name: Warm cache & run a tiny smoke
        run: |
          ./build/bin/llama-embedding --help >/dev/null 2>&1
          pytest -q tests/e2e/embedding -k raw_vs_json_consistency