import time

import pytest
from utils import *

server: ServerProcess


@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.router()


@pytest.mark.parametrize(
    "model,success",
    [
        ("ggml-org/tinygemma3-GGUF:Q8_0", True),
        ("non-existent/model", False),
    ],
)
def test_router_chat_completion_stream(model: str, success: bool):
    global server
    server.start()
    content = ""
    ex: ServerError | None = None
    try:
        res = server.make_stream_request("POST", "/chat/completions", data={
            "model": model,
            "max_tokens": 16,
            "messages": [
                {"role": "user", "content": "hello"},
            ],
            "stream": True,
        })
        for data in res:
            if data["choices"]:
                choice = data["choices"][0]
                if choice["finish_reason"] in ["stop", "length"]:
                    # the final chunk carries no content delta
                    assert "content" not in choice["delta"]
                else:
                    assert choice["finish_reason"] is None
                    content += choice["delta"]["content"] or ''
    except ServerError as e:
        ex = e

    if success:
        assert ex is None
        assert len(content) > 0
    else:
        assert ex is not None
        assert content == ""


def _get_model_status(model_id: str) -> str:
    """Return the status string reported by GET /models for the given model."""
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    for item in res.body.get("data", []):
        if item.get("id") == model_id or item.get("model") == model_id:
            return item["status"]["value"]
    raise AssertionError(f"Model {model_id} not found in /models response")


def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
    """Poll GET /models until the model reaches one of the desired statuses or the timeout expires."""
    deadline = time.time() + timeout
    last_status = None
    while time.time() < deadline:
        last_status = _get_model_status(model_id)
        if last_status in desired:
            return last_status
        time.sleep(1)
    raise AssertionError(
        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
    )


def _load_model_and_wait(
    model_id: str,
    timeout: int = 60,
    headers: dict | None = None,
) -> None:
    """Request a model load via POST /models/load and wait until it reports 'loaded'."""
    load_res = server.make_request(
        "POST", "/models/load", data={"model": model_id}, headers=headers
    )
    assert load_res.status_code == 200
    assert isinstance(load_res.body, dict)
    assert load_res.body.get("success") is True
    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)


def test_router_unload_model():
    global server
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
    _load_model_and_wait(model_id)

    unload_res = server.make_request("POST", "/models/unload", data={"model": model_id})
    assert unload_res.status_code == 200
    assert unload_res.body.get("success") is True

    _wait_for_model_status(model_id, {"unloaded"})


def test_router_models_max_evicts_lru():
    global server
    server.models_max = 2
    server.start()

    candidate_models = [
        "ggml-org/tinygemma3-GGUF:Q8_0",
        "ggml-org/test-model-stories260K",
        "ggml-org/test-model-stories260K-infill",
    ]

    # Load only the first 2 models to fill the cache
    first, second, third = candidate_models[:3]
    _load_model_and_wait(first, timeout=120)
    _load_model_and_wait(second, timeout=120)

    # Verify both models are loaded
    assert _get_model_status(first) == "loaded"
    assert _get_model_status(second) == "loaded"

    # Load the third model - this should trigger LRU eviction of the first model
    _load_model_and_wait(third, timeout=120)

    # Verify eviction: third is loaded, first was evicted
    assert _get_model_status(third) == "loaded"
    assert _get_model_status(first) == "unloaded"


def test_router_no_models_autoload():
    global server
    server.no_models_autoload = True
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    # With autoload disabled, a request for a model that is not loaded should fail
    res = server.make_request(
        "POST",
        "/v1/chat/completions",
        data={
            "model": model_id,
            "messages": [{"role": "user", "content": "hello"}],
            "max_tokens": 4,
        },
    )
    assert res.status_code == 400
    assert "error" in res.body

    # After an explicit load, the same request should succeed
    _load_model_and_wait(model_id)

    success_res = server.make_request(
        "POST",
        "/v1/chat/completions",
        data={
            "model": model_id,
            "messages": [{"role": "user", "content": "hello"}],
            "max_tokens": 4,
        },
    )
    assert success_res.status_code == 200
    assert "error" not in success_res.body


def test_router_api_key_required():
    global server
    server.api_key = "sk-router-secret"
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
    auth_headers = {"Authorization": f"Bearer {server.api_key}"}

    # Request without an API key should be rejected
    res = server.make_request(
        "POST",
        "/v1/chat/completions",
        data={
            "model": model_id,
            "messages": [{"role": "user", "content": "hello"}],
            "max_tokens": 4,
        },
    )
    assert res.status_code == 401
    assert res.body.get("error", {}).get("type") == "authentication_error"

    # The same request with the correct key should succeed
    _load_model_and_wait(model_id, headers=auth_headers)

    authed = server.make_request(
        "POST",
        "/v1/chat/completions",
        headers=auth_headers,
        data={
            "model": model_id,
            "messages": [{"role": "user", "content": "hello"}],
            "max_tokens": 4,
        },
    )
    assert authed.status_code == 200
    assert "error" not in authed.body