From 5ce810ee516d87134bd5a1536a38a9564204c26a Mon Sep 17 00:00:00 2001 From: Sam Malayek Date: Mon, 3 Nov 2025 00:35:33 -0800 Subject: [PATCH] Update test and workflow to match new RFC --- .github/workflows/embedding.yml | 193 +++++++++++++++-- tests/e2e/embedding/test_embedding_cli.py | 243 ++++++++++++++-------- 2 files changed, 327 insertions(+), 109 deletions(-) diff --git a/.github/workflows/embedding.yml b/.github/workflows/embedding.yml index db566b36d5..f368946177 100644 --- a/.github/workflows/embedding.yml +++ b/.github/workflows/embedding.yml @@ -25,41 +25,196 @@ on: - 'tests/e2e/embedding/**' jobs: - embedding-cli-tests: + embedding-cli-tests-linux: runs-on: ubuntu-latest + env: + LLAMA_CACHE: tmp # stable path for cache + EMBD_TEST_DEBUG: "1" steps: + - uses: actions/checkout@v4 + with: { fetch-depth: 0 } + + - name: Restore model cache + uses: actions/cache@v4 + with: + path: | + ~/.cache/llama.cpp + tmp + key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1 + restore-keys: | + hf-${{ runner.os }}- + hf- + - name: Install system deps run: | sudo apt-get update sudo apt-get -y install \ - build-essential \ - cmake \ - curl \ - libcurl4-openssl-dev \ - python3-pip - - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 + build-essential cmake curl libcurl4-openssl-dev python3-pip - name: Set up Python uses: actions/setup-python@v5 - with: - python-version: '3.11' + with: { python-version: '3.11' } - name: Install Python deps run: | - pip install -r requirements.txt || echo "No extra requirements found" - pip install pytest + python -m pip install -r requirements.txt || echo "No extra requirements found" + python -m pip install pytest numpy pytest-timeout - name: Build llama-embedding run: | - cmake -B build \ - -DCMAKE_BUILD_TYPE=Release + cmake -B build -DCMAKE_BUILD_TYPE=Release cmake --build build --target llama-embedding -j $(nproc) - - name: Run embedding tests + - name: Pre-download tiny model (retry x3 on network) run: | - pytest -v tests/e2e/embedding + set -e + tries=0 + until ./build/bin/llama-embedding \ + -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \ + -hff embeddinggemma-300M-qat-Q4_0.gguf \ + --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do + tries=$((tries+1)) + if [ $tries -ge 3 ]; then + echo "Pre-download failed after $tries attempts" + exit 1 + fi + echo "Retrying download ($tries/3)..." + sleep 3 + done + + - name: Run embedding tests (30s per-test cap) + shell: bash + run: | + set -o pipefail + pytest -v tests/e2e/embedding \ + --timeout=30 \ + --durations=10 \ + --junitxml=pytest-report.xml | tee pytest-output.txt + + - name: Upload test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: linux-embedding-tests + path: | + pytest-output.txt + pytest-report.xml + + - name: Save model cache + if: always() + uses: actions/cache@v4 + with: + path: | + ~/.cache/llama.cpp + tmp + key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1 + + embedding-cli-tests-windows: + runs-on: windows-latest + continue-on-error: true + env: + LLAMA_CACHE: tmp + EMBD_TEST_DEBUG: "1" + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: '3.11' } + + # --- vcpkg plain bootstrap (no actions, no submodules) --- + - name: Bootstrap vcpkg + shell: pwsh + run: | + $env:VCPKG_ROOT = "$env:RUNNER_TEMP\vcpkg" + git clone https://github.com/microsoft/vcpkg $env:VCPKG_ROOT + & "$env:VCPKG_ROOT\bootstrap-vcpkg.bat" -disableMetrics + echo "VCPKG_ROOT=$env:VCPKG_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append + + - name: Install curl with OpenSSL via vcpkg + shell: pwsh + run: | + & "$env:VCPKG_ROOT\vcpkg.exe" install curl[openssl]:x64-windows + + - name: Restore model cache + uses: actions/cache@v4 + with: + path: | + $HOME/.cache/llama.cpp + tmp + key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1 + restore-keys: | + hf-${{ runner.os }}- + hf- + + - name: Install Python deps + run: pip install pytest numpy + + - name: Configure & Build (Release) + shell: pwsh + run: | + cmake -B build -DCMAKE_BUILD_TYPE=Release ` + -DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_ROOT\scripts\buildsystems\vcpkg.cmake" + cmake --build build --target llama-embedding --config Release -j 2 + + - name: Pre-download tiny model (retry x3) + shell: bash + run: | + set -e + tries=0 + until ./build/bin/Release/llama-embedding.exe \ + -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \ + -hff embeddinggemma-300M-qat-Q4_0.gguf \ + --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do + tries=$((tries+1)) + if [ $tries -ge 3 ]; then + echo "Pre-download failed after $tries attempts"; exit 1 + fi + echo "Retrying download ($tries/3)..."; sleep 3 + done + + - name: Run smoke tests + shell: bash + run: | + pytest -q tests/e2e/embedding -k raw_vs_json_consistency + + + + embedding-cli-tests-macos: + runs-on: macos-latest + continue-on-error: true + env: + LLAMA_CACHE: tmp + EMBD_TEST_DEBUG: "1" + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: '3.11' } + + - name: Install Python deps + run: pip install pytest numpy + + - name: Build + run: | + cmake -B build -DCMAKE_BUILD_TYPE=Release + cmake --build build --target llama-embedding -j 3 + + - name: Pre-download tiny model (retry x3) + run: | + set -e + tries=0 + until ./build/bin/llama-embedding \ + -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \ + -hff embeddinggemma-300M-qat-Q4_0.gguf \ + --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do + tries=$((tries+1)) + if [ $tries -ge 3 ]; then + echo "Pre-download failed after $tries attempts"; exit 1 + fi + echo "Retrying download ($tries/3)..."; sleep 3 + done + + - name: Warm cache & run a tiny smoke + run: | + ./build/bin/llama-embedding --help >/dev/null 2>&1 + pytest -q tests/e2e/embedding -k raw_vs_json_consistency diff --git a/tests/e2e/embedding/test_embedding_cli.py b/tests/e2e/embedding/test_embedding_cli.py index cf4731d6bf..80f986ec86 100644 --- a/tests/e2e/embedding/test_embedding_cli.py +++ b/tests/e2e/embedding/test_embedding_cli.py @@ -1,10 +1,13 @@ import json import hashlib +import logging import os import pytest import subprocess from pathlib import Path import numpy as np +import time +from typing import Optional, List # --------------------------------------------------------------------------- # Configuration constants @@ -15,72 +18,111 @@ REPO_ROOT = Path(__file__).resolve().parents[3] EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding") DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")} SEED = "42" +ALLOWED_DIMS = {384, 768, 1024, 4096} +SMALL_CTX = 16 # preflight/cache +TEST_CTX = 1024 # main tests + +log = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Model setup helpers +# Shared helpers (single source of truth for command building) # --------------------------------------------------------------------------- -def get_model_hf_params(): - """Default lightweight embedding model.""" + +def resolve_exe() -> Path: + exe = EXE + if not exe.exists() and os.name == "nt": + alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe" + if alt.exists(): + exe = alt + if not exe.exists(): + raise FileNotFoundError(f"llama-embedding not found under {REPO_ROOT}/build/bin") + return exe + + +def hf_params_default(): return { "hf_repo": "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF", "hf_file": "embeddinggemma-300M-qat-Q4_0.gguf", } -@pytest.fixture(scope="session") -def embedding_model(): - """Download/cache model once per session.""" - exe_path = EXE - if not exe_path.exists(): - alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe" - if alt.exists(): - exe_path = alt - else: - raise FileNotFoundError(f"llama-embedding binary not found under {REPO_ROOT}/build/bin") - - params = get_model_hf_params() +def build_cmd( + *, + exe: Path, + params: dict, + fmt: str, + threads: int, + ctx: int, + seed: str, + extra: Optional[List[str]] = None, # was: list[str] | None +) -> List[str]: # was: list[str] + assert fmt in {"raw", "json"}, f"unsupported fmt={fmt}" cmd = [ - str(exe_path), + str(exe), "-hfr", params["hf_repo"], "-hff", params["hf_file"], - "--ctx-size", "16", - "--embd-output-format", "json", - "--no-warmup", - "--threads", "1", - "--seed", SEED, + "--ctx-size", str(ctx), + "--embd-output-format", fmt, + "--threads", str(threads), + "--seed", seed, ] - res = subprocess.run(cmd, input="ok", capture_output=True, text=True, env=DEFAULT_ENV) - assert res.returncode == 0, f"model download failed: {res.stderr}" - return params + if extra: + cmd.extend(extra) + return cmd +def run_cmd(cmd: list[str], text: str, timeout: int = 60) -> str: + t0 = time.perf_counter() + res = subprocess.run(cmd, input=text, capture_output=True, text=True, + env=DEFAULT_ENV, timeout=timeout) + dur_ms = (time.perf_counter() - t0) * 1000.0 + if os.environ.get("EMBD_TEST_DEBUG") == "1": + log.debug("embedding cmd finished in %.1f ms", dur_ms) + + if res.returncode != 0: + raise AssertionError(f"embedding failed ({res.returncode}):\n{res.stderr[:400]}") + out = res.stdout.strip() + assert out, "empty stdout from llama-embedding" + return out + +# --------------------------------------------------------------------------- +# Session model preflight/cache +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def embedding_model(): + """Download/cache model once per session with a tiny ctx + no warmup.""" + exe = resolve_exe() + params = hf_params_default() + cmd = build_cmd( + exe=exe, params=params, fmt="json", + threads=1, ctx=SMALL_CTX, seed=SEED, + extra=["--no-warmup"], + ) + _ = run_cmd(cmd, text="ok") + return params + # --------------------------------------------------------------------------- # Utility functions # --------------------------------------------------------------------------- -def run_embedding(text: str, fmt: str = "raw", params=None) -> str: - """Runs llama-embedding and returns stdout (string).""" - exe_path = EXE - if not exe_path.exists(): - raise FileNotFoundError(f"Missing binary: {exe_path}") - params = params or get_model_hf_params() - cmd = [ - str(exe_path), - "-hfr", params["hf_repo"], - "-hff", params["hf_file"], - "--ctx-size", "2048", - "--embd-output-format", fmt, - "--threads", "1", - "--seed", SEED, - ] - result = subprocess.run(cmd, input=text, capture_output=True, text=True, env=DEFAULT_ENV) - if result.returncode: - raise AssertionError(f"embedding failed ({result.returncode}):\n{result.stderr[:400]}") - out = result.stdout.strip() - assert out, f"empty output for text={text!r}, fmt={fmt}" - return out + +def run_embedding( + text: str, + *, + fmt: str = "raw", + threads: int = 1, + ctx: int = TEST_CTX, + params: Optional[dict] = None, # was: dict | None + timeout: int = 60, +) -> str: + exe = resolve_exe() + params = params or hf_params_default() + cmd = build_cmd(exe=exe, params=params, fmt=fmt, threads=threads, ctx=ctx, seed=SEED) + return run_cmd(cmd, text, timeout=timeout) def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: @@ -92,69 +134,65 @@ def embedding_hash(vec: np.ndarray) -> str: return hashlib.sha256(vec[:8].tobytes()).hexdigest()[:16] +def parse_vec(out: str, fmt: str) -> np.ndarray: + if fmt == "raw": + arr = np.array(out.split(), dtype=np.float32) + else: + arr = np.array(json.loads(out)["data"][0]["embedding"], dtype=np.float32) + return arr + # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- + # Register custom mark so pytest doesn't warn about it pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning") -@pytest.mark.slow @pytest.mark.parametrize("fmt", ["raw", "json"]) @pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"]) def test_embedding_runs_and_finite(fmt, text, embedding_model): - """Ensure embeddings run end-to-end and produce finite floats.""" - out = run_embedding(text, fmt, embedding_model) - floats = ( - np.array(out.split(), float) - if fmt == "raw" - else np.array(json.loads(out)["data"][0]["embedding"], float) - ) - assert len(floats) > 100 - assert np.all(np.isfinite(floats)), f"non-finite values in {fmt} output" - assert 0.1 < np.linalg.norm(floats) < 10 + out = run_embedding(text, fmt=fmt, threads=1, ctx=TEST_CTX, params=embedding_model) + vec = parse_vec(out, fmt) + assert vec.dtype == np.float32 + # dim & finiteness + assert len(vec) in ALLOWED_DIMS, f"unexpected dim={len(vec)}" + assert np.all(np.isfinite(vec)) + assert 0.1 < np.linalg.norm(vec) < 10 def test_raw_vs_json_consistency(embedding_model): - """Compare raw vs JSON embedding output for same text.""" text = "hello world" - raw = np.array(run_embedding(text, "raw", embedding_model).split(), float) - jsn = np.array(json.loads(run_embedding(text, "json", embedding_model))["data"][0]["embedding"], float) - + raw = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw") + jsn = parse_vec(run_embedding(text, fmt="json", params=embedding_model), "json") assert raw.shape == jsn.shape cos = cosine_similarity(raw, jsn) - assert cos > 0.999, f"divergence: cos={cos:.4f}" - assert embedding_hash(raw) == embedding_hash(jsn), "hash mismatch → possible nondeterminism" + assert cos > 0.999, f"raw/json divergence: cos={cos:.6f}" + assert embedding_hash(raw) == embedding_hash(jsn) def test_empty_input_deterministic(embedding_model): - """Empty input should yield finite, deterministic vector.""" - v1 = np.array(run_embedding("", "raw", embedding_model).split(), float) - v2 = np.array(run_embedding("", "raw", embedding_model).split(), float) + v1 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw") + v2 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw") assert np.all(np.isfinite(v1)) - cos = cosine_similarity(v1, v2) - assert cos > 0.9999, f"Empty input not deterministic (cos={cos:.5f})" - assert 0.1 < np.linalg.norm(v1) < 10 + assert embedding_hash(v1) == embedding_hash(v2) + assert cosine_similarity(v1, v2) > 0.99999 -@pytest.mark.slow def test_very_long_input_stress(embedding_model): """Stress test: large input near context window.""" text = "lorem " * 2000 - vec = np.array(run_embedding(text, "raw", embedding_model).split(), float) - assert len(vec) > 100 + vec = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw") + assert len(vec) in ALLOWED_DIMS assert np.isfinite(np.linalg.norm(vec)) -@pytest.mark.parametrize( - "text", - [" ", "\n\n\n", "123 456 789"], -) +@pytest.mark.parametrize("text", [" ", "\n\n\n", "123 456 789"]) def test_low_information_inputs_stable(text, embedding_model): """Whitespace/numeric inputs should yield stable embeddings.""" - v1 = np.array(run_embedding(text, "raw", embedding_model).split(), float) - v2 = np.array(run_embedding(text, "raw", embedding_model).split(), float) + v1 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw") + v2 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw") cos = cosine_similarity(v1, v2) assert cos > 0.999, f"unstable embedding for {text!r}" @@ -162,7 +200,8 @@ def test_low_information_inputs_stable(text, embedding_model): @pytest.mark.parametrize("flag", ["--no-such-flag", "--help"]) def test_invalid_or_help_flag(flag): """Invalid flags should fail; help should succeed.""" - res = subprocess.run([str(EXE), flag], capture_output=True, text=True) + exe = resolve_exe() + res = subprocess.run([str(exe), flag], capture_output=True, text=True, env=DEFAULT_ENV) if flag == "--no-such-flag": assert res.returncode != 0 assert any(k in res.stderr.lower() for k in ("error", "invalid", "unknown")) @@ -172,17 +211,41 @@ def test_invalid_or_help_flag(flag): @pytest.mark.parametrize("fmt", ["raw", "json"]) -@pytest.mark.parametrize("text", ["deterministic test", "deterministic test again"]) -def test_repeated_call_consistent(fmt, text, embedding_model): - """Same input → same hash across repeated runs.""" - out1 = run_embedding(text, fmt, embedding_model) - out2 = run_embedding(text, fmt, embedding_model) +def test_threads_two_similarity_vs_single(fmt, embedding_model): + text = "determinism vs threads" + single = parse_vec(run_embedding(text, fmt=fmt, threads=1, params=embedding_model), fmt) + multi = parse_vec(run_embedding(text, fmt=fmt, threads=2, params=embedding_model), fmt) + assert single.shape == multi.shape + cos = cosine_similarity(single, multi) + assert cos >= 0.999, f"threads>1 similarity too low: {cos:.6f}" - if fmt == "json": - v1 = np.array(json.loads(out1)["data"][0]["embedding"], float) - v2 = np.array(json.loads(out2)["data"][0]["embedding"], float) - else: - v1 = np.array(out1.split(), float) - v2 = np.array(out2.split(), float) - assert embedding_hash(v1) == embedding_hash(v2) +def test_json_shape_schema_minimal(embedding_model): + js = json.loads(run_embedding("schema check", fmt="json", params=embedding_model)) + assert isinstance(js, dict) + + # Top-level “object” (present in CLI) is optional for us + if "object" in js: + assert js["object"] in ("list", "embeddings", "embedding_list") + + # Required: data[0].embedding + index + assert "data" in js and isinstance(js["data"], list) and len(js["data"]) >= 1 + item0 = js["data"][0] + assert isinstance(item0, dict) + if "object" in item0: + assert item0["object"] in ("embedding",) + assert "index" in item0 and item0["index"] == 0 + assert "embedding" in item0 and isinstance(item0["embedding"], list) + assert len(item0["embedding"]) in ALLOWED_DIMS + + # Optional fields: tolerate absence in current CLI output + if "model" in js: + assert isinstance(js["model"], str) + if "dim" in js: + assert js["dim"] == len(item0["embedding"]) + usage = js.get("usage", {}) + if usage: + assert isinstance(usage, dict) + # if present, prompt_tokens should be int + if "prompt_tokens" in usage: + assert isinstance(usage["prompt_tokens"], int)