Sam Malayek 2026-02-13 15:05:19 -08:00 committed by GitHub
commit ef7047b168
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 471 additions and 0 deletions

.github/workflows/embedding.yml

@@ -0,0 +1,220 @@
# Embedding CLI build and tests
name: Embedding CLI
on:
workflow_dispatch:
push:
branches: [master, feature/**]
paths:
- '.github/workflows/embedding.yml'
- 'examples/**'
- 'src/**'
- 'ggml/**'
- 'include/**'
- '**/CMakeLists.txt'
- 'tests/e2e/embedding/**'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/embedding.yml'
- 'examples/**'
- 'src/**'
- 'ggml/**'
- 'include/**'
- '**/CMakeLists.txt'
- 'tests/e2e/embedding/**'
jobs:
embedding-cli-tests-linux:
runs-on: ubuntu-latest
env:
LLAMA_CACHE: tmp # stable path for cache
EMBD_TEST_DEBUG: "1"
steps:
- uses: actions/checkout@v4
with: { fetch-depth: 0 }
- name: Restore model cache
uses: actions/cache@v4
with:
path: |
~/.cache/llama.cpp
tmp
key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
restore-keys: |
hf-${{ runner.os }}-
hf-
- name: Install system deps
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential cmake curl libcurl4-openssl-dev python3-pip
- name: Set up Python
uses: actions/setup-python@v5
with: { python-version: '3.11' }
- name: Install Python deps
run: |
          if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
python -m pip install pytest numpy pytest-timeout
- name: Build llama-embedding
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build --target llama-embedding -j $(nproc)
- name: Pre-download tiny model (retry x3 on network)
run: |
set -e
tries=0
until ./build/bin/llama-embedding \
-hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
-hff embeddinggemma-300M-qat-Q4_0.gguf \
--ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
tries=$((tries+1))
if [ $tries -ge 3 ]; then
echo "Pre-download failed after $tries attempts"
exit 1
fi
echo "Retrying download ($tries/3)..."
sleep 3
done
- name: Run embedding tests (30s per-test cap)
shell: bash
run: |
set -o pipefail
pytest -v tests/e2e/embedding \
--timeout=30 \
--durations=10 \
--junitxml=pytest-report.xml | tee pytest-output.txt
- name: Upload test artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: linux-embedding-tests
path: |
pytest-output.txt
pytest-report.xml
      - name: Save model cache
        if: always()
        uses: actions/cache/save@v4
        with:
          path: |
            ~/.cache/llama.cpp
            tmp
          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
embedding-cli-tests-windows:
runs-on: windows-latest
continue-on-error: true
env:
LLAMA_CACHE: tmp
EMBD_TEST_DEBUG: "1"
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with: { python-version: '3.11' }
# --- vcpkg plain bootstrap (no actions, no submodules) ---
- name: Bootstrap vcpkg
shell: pwsh
run: |
$env:VCPKG_ROOT = "$env:RUNNER_TEMP\vcpkg"
git clone https://github.com/microsoft/vcpkg $env:VCPKG_ROOT
& "$env:VCPKG_ROOT\bootstrap-vcpkg.bat" -disableMetrics
echo "VCPKG_ROOT=$env:VCPKG_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append
- name: Install curl with OpenSSL via vcpkg
shell: pwsh
run: |
& "$env:VCPKG_ROOT\vcpkg.exe" install curl[openssl]:x64-windows
- name: Restore model cache
uses: actions/cache@v4
with:
path: |
            ~/.cache/llama.cpp
tmp
key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
restore-keys: |
hf-${{ runner.os }}-
hf-
- name: Install Python deps
run: pip install pytest numpy
- name: Configure & Build (Release)
shell: pwsh
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release `
-DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_ROOT\scripts\buildsystems\vcpkg.cmake"
cmake --build build --target llama-embedding --config Release -j 2
- name: Pre-download tiny model (retry x3)
shell: bash
run: |
set -e
tries=0
until ./build/bin/Release/llama-embedding.exe \
-hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
-hff embeddinggemma-300M-qat-Q4_0.gguf \
--ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
tries=$((tries+1))
if [ $tries -ge 3 ]; then
echo "Pre-download failed after $tries attempts"; exit 1
fi
echo "Retrying download ($tries/3)..."; sleep 3
done
- name: Run smoke tests
shell: bash
run: |
pytest -q tests/e2e/embedding -k raw_vs_json_consistency
embedding-cli-tests-macos:
runs-on: macos-latest
continue-on-error: true
env:
LLAMA_CACHE: tmp
EMBD_TEST_DEBUG: "1"
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with: { python-version: '3.11' }
- name: Install Python deps
run: pip install pytest numpy
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build --target llama-embedding -j 3
- name: Pre-download tiny model (retry x3)
run: |
set -e
tries=0
until ./build/bin/llama-embedding \
-hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
-hff embeddinggemma-300M-qat-Q4_0.gguf \
--ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
tries=$((tries+1))
if [ $tries -ge 3 ]; then
echo "Pre-download failed after $tries attempts"; exit 1
fi
echo "Retrying download ($tries/3)..."; sleep 3
done
- name: Warm cache & run a tiny smoke
run: |
./build/bin/llama-embedding --help >/dev/null 2>&1
pytest -q tests/e2e/embedding -k raw_vs_json_consistency


@@ -0,0 +1,251 @@
import json
import hashlib
import logging
import os
import pytest
import subprocess
from pathlib import Path
import numpy as np
import time
from typing import Optional, List
# ---------------------------------------------------------------------------
# Configuration constants
# ---------------------------------------------------------------------------
EPS = 1e-3
REPO_ROOT = Path(__file__).resolve().parents[3]
EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding")
DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")}
SEED = "42"
ALLOWED_DIMS = {384, 768, 1024, 4096}
SMALL_CTX = 16 # preflight/cache
TEST_CTX = 1024 # main tests
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Shared helpers (single source of truth for command building)
# ---------------------------------------------------------------------------
def resolve_exe() -> Path:
exe = EXE
if not exe.exists() and os.name == "nt":
alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe"
if alt.exists():
exe = alt
if not exe.exists():
raise FileNotFoundError(f"llama-embedding not found under {REPO_ROOT}/build/bin")
return exe
def hf_params_default():
return {
"hf_repo": "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
"hf_file": "embeddinggemma-300M-qat-Q4_0.gguf",
}
def build_cmd(
*,
exe: Path,
params: dict,
fmt: str,
threads: int,
ctx: int,
seed: str,
extra: Optional[List[str]] = None, # was: list[str] | None
) -> List[str]: # was: list[str]
assert fmt in {"raw", "json"}, f"unsupported fmt={fmt}"
cmd = [
str(exe),
"-hfr", params["hf_repo"],
"-hff", params["hf_file"],
"--ctx-size", str(ctx),
"--embd-output-format", fmt,
"--threads", str(threads),
"--seed", seed,
]
if extra:
cmd.extend(extra)
return cmd
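# Illustrative note (not part of the CLI contract): with hf_params_default(),
# fmt="json", threads=1, ctx=SMALL_CTX and seed=SEED, build_cmd() yields an
# argv along the lines of:
#   [<exe>, "-hfr", "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
#    "-hff", "embeddinggemma-300M-qat-Q4_0.gguf",
#    "--ctx-size", "16", "--embd-output-format", "json",
#    "--threads", "1", "--seed", "42"]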
def run_cmd(cmd: List[str], text: str, timeout: int = 60) -> str:
t0 = time.perf_counter()
res = subprocess.run(cmd, input=text, capture_output=True, text=True,
env=DEFAULT_ENV, timeout=timeout)
dur_ms = (time.perf_counter() - t0) * 1000.0
if os.environ.get("EMBD_TEST_DEBUG") == "1":
log.debug("embedding cmd finished in %.1f ms", dur_ms)
if res.returncode != 0:
raise AssertionError(f"embedding failed ({res.returncode}):\n{res.stderr[:400]}")
out = res.stdout.strip()
assert out, "empty stdout from llama-embedding"
return out
# ---------------------------------------------------------------------------
# Session model preflight/cache
# ---------------------------------------------------------------------------
@pytest.fixture(scope="session")
def embedding_model():
"""Download/cache model once per session with a tiny ctx + no warmup."""
exe = resolve_exe()
params = hf_params_default()
cmd = build_cmd(
exe=exe, params=params, fmt="json",
threads=1, ctx=SMALL_CTX, seed=SEED,
extra=["--no-warmup"],
)
_ = run_cmd(cmd, text="ok")
return params
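# Note: DEFAULT_ENV pins LLAMA_CACHE (default "tmp", pre-populated by the CI
# cache step), so the preflight above should only hit the network on a cold
# cache; later run_embedding() calls reuse the downloaded GGUF.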
# ---------------------------------------------------------------------------
# Utility functions
# ---------------------------------------------------------------------------
def run_embedding(
text: str,
*,
fmt: str = "raw",
threads: int = 1,
ctx: int = TEST_CTX,
params: Optional[dict] = None, # was: dict | None
timeout: int = 60,
) -> str:
exe = resolve_exe()
params = params or hf_params_default()
cmd = build_cmd(exe=exe, params=params, fmt=fmt, threads=threads, ctx=ctx, seed=SEED)
return run_cmd(cmd, text, timeout=timeout)
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
def embedding_hash(vec: np.ndarray) -> str:
"""Return short deterministic signature for regression tracking."""
return hashlib.sha256(vec[:8].tobytes()).hexdigest()[:16]
def parse_vec(out: str, fmt: str) -> np.ndarray:
if fmt == "raw":
arr = np.array(out.split(), dtype=np.float32)
else:
arr = np.array(json.loads(out)["data"][0]["embedding"], dtype=np.float32)
return arr
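# Assumed output shapes (inferred from the parsing above, not a documented schema):
#   raw  -> whitespace-separated floats on stdout, e.g. "0.0123 -0.0456 ..."
#   json -> an object exposing data[0].embedding as a list of floats,
#           mirroring the OpenAI-style embedding response layout.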
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
# Silence unknown-mark warnings module-wide rather than registering custom marks in pytest.ini
pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning")
@pytest.mark.parametrize("fmt", ["raw", "json"])
@pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"])
def test_embedding_runs_and_finite(fmt, text, embedding_model):
out = run_embedding(text, fmt=fmt, threads=1, ctx=TEST_CTX, params=embedding_model)
vec = parse_vec(out, fmt)
assert vec.dtype == np.float32
# dim & finiteness
assert len(vec) in ALLOWED_DIMS, f"unexpected dim={len(vec)}"
assert np.all(np.isfinite(vec))
assert 0.1 < np.linalg.norm(vec) < 10
def test_raw_vs_json_consistency(embedding_model):
text = "hello world"
raw = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
jsn = parse_vec(run_embedding(text, fmt="json", params=embedding_model), "json")
assert raw.shape == jsn.shape
cos = cosine_similarity(raw, jsn)
assert cos > 0.999, f"raw/json divergence: cos={cos:.6f}"
assert embedding_hash(raw) == embedding_hash(jsn)
def test_empty_input_deterministic(embedding_model):
v1 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
v2 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
assert np.all(np.isfinite(v1))
assert embedding_hash(v1) == embedding_hash(v2)
assert cosine_similarity(v1, v2) > 0.99999
def test_very_long_input_stress(embedding_model):
"""Stress test: large input near context window."""
text = "lorem " * 2000
vec = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
assert len(vec) in ALLOWED_DIMS
assert np.isfinite(np.linalg.norm(vec))
@pytest.mark.parametrize("text", [" ", "\n\n\n", "123 456 789"])
def test_low_information_inputs_stable(text, embedding_model):
"""Whitespace/numeric inputs should yield stable embeddings."""
v1 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
v2 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
cos = cosine_similarity(v1, v2)
assert cos > 0.999, f"unstable embedding for {text!r}"
@pytest.mark.parametrize("flag", ["--no-such-flag", "--help"])
def test_invalid_or_help_flag(flag):
"""Invalid flags should fail; help should succeed."""
exe = resolve_exe()
res = subprocess.run([str(exe), flag], capture_output=True, text=True, env=DEFAULT_ENV)
if flag == "--no-such-flag":
assert res.returncode != 0
assert any(k in res.stderr.lower() for k in ("error", "invalid", "unknown"))
else:
assert res.returncode == 0
assert "usage" in (res.stdout.lower() + res.stderr.lower())
@pytest.mark.parametrize("fmt", ["raw", "json"])
def test_threads_two_similarity_vs_single(fmt, embedding_model):
text = "determinism vs threads"
single = parse_vec(run_embedding(text, fmt=fmt, threads=1, params=embedding_model), fmt)
multi = parse_vec(run_embedding(text, fmt=fmt, threads=2, params=embedding_model), fmt)
assert single.shape == multi.shape
cos = cosine_similarity(single, multi)
assert cos >= 0.999, f"threads>1 similarity too low: {cos:.6f}"
def test_json_shape_schema_minimal(embedding_model):
js = json.loads(run_embedding("schema check", fmt="json", params=embedding_model))
assert isinstance(js, dict)
    # Top-level "object" (present in CLI output) is optional for us
if "object" in js:
assert js["object"] in ("list", "embeddings", "embedding_list")
# Required: data[0].embedding + index
assert "data" in js and isinstance(js["data"], list) and len(js["data"]) >= 1
item0 = js["data"][0]
assert isinstance(item0, dict)
if "object" in item0:
assert item0["object"] in ("embedding",)
assert "index" in item0 and item0["index"] == 0
assert "embedding" in item0 and isinstance(item0["embedding"], list)
assert len(item0["embedding"]) in ALLOWED_DIMS
# Optional fields: tolerate absence in current CLI output
if "model" in js:
assert isinstance(js["model"], str)
if "dim" in js:
assert js["dim"] == len(item0["embedding"])
usage = js.get("usage", {})
if usage:
assert isinstance(usage, dict)
# if present, prompt_tokens should be int
if "prompt_tokens" in usage:
assert isinstance(usage["prompt_tokens"], int)
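# ---------------------------------------------------------------------------
# Sketch: offline schema sanity check (illustrative addition)
# ---------------------------------------------------------------------------
# A minimal offline check of parse_vec() against a hand-built payload in the
# shape test_json_shape_schema_minimal tolerates. The payload is invented for
# illustration and assumes nothing about real llama-embedding output beyond
# the keys the tests above already require.
def test_parse_vec_accepts_minimal_json_shape():
    payload = {
        "object": "list",                          # optional per the schema test
        "data": [{
            "object": "embedding",                 # optional
            "index": 0,                            # required
            "embedding": [0.0] * 768,              # required; 768 is in ALLOWED_DIMS
        }],
        "model": "embeddinggemma-300M-qat-Q4_0",   # optional, invented value
        "usage": {"prompt_tokens": 3},             # optional
    }
    vec = parse_vec(json.dumps(payload), "json")
    assert vec.shape == (768,)
    assert vec.dtype == np.float32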