add test
parent 5423d42a35
commit 0ef3b61e82
@@ -0,0 +1,50 @@
+import pytest
+from utils import *
+
+server: ServerProcess
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.router()
+
+
+@pytest.mark.parametrize(
+    "model,success",
+    [
+        ("ggml-org/tinygemma3-GGUF:Q8_0", True),
+        ("non-existent/model", False),
+    ]
+)
+def test_chat_completion_stream(model: str, success: bool):
+    # TODO: make sure the model is in cache (ie. ServerProcess.load_all()) before starting the router server
+    global server
+    server.start()
+    content = ""
+    ex: ServerError | None = None
+    try:
+        res = server.make_stream_request("POST", "/chat/completions", data={
+            "model": model,
+            "max_tokens": 16,
+            "messages": [
+                {"role": "user", "content": "hello"},
+            ],
+            "stream": True,
+        })
+        for data in res:
+            if data["choices"]:
+                choice = data["choices"][0]
+                if choice["finish_reason"] in ["stop", "length"]:
+                    assert "content" not in choice["delta"]
+                else:
+                    assert choice["finish_reason"] is None
+                    content += choice["delta"]["content"] or ''
+    except ServerError as e:
+        ex = e
+
+    if success:
+        assert ex is None
+        assert len(content) > 0
+    else:
+        assert ex is not None
+        assert content == ""
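The assertions in the loop above rely on the OpenAI-compatible streaming format the server tests already use: each yielded `data` item carries a `choices` list whose entry has a `delta` and a `finish_reason`. As a rough sketch (field names assumed from that format, not taken from this diff), a mid-stream chunk and a final chunk look like:

    # Hypothetical chunk shapes consumed by the loop above (assumed OpenAI-style deltas).
    mid_chunk = {
        "choices": [
            {"index": 0, "delta": {"content": "hel"}, "finish_reason": None},
        ]
    }
    final_chunk = {
        "choices": [
            # no "content" in the delta once finish_reason is "stop" or "length"
            {"index": 0, "delta": {}, "finish_reason": "stop"},
        ]
    }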
@@ -46,7 +46,7 @@ class ServerProcess:
     debug: bool = False
     server_port: int = 8080
     server_host: str = "127.0.0.1"
-    model_hf_repo: str = "ggml-org/models"
+    model_hf_repo: str | None = "ggml-org/models"
     model_hf_file: str | None = "tinyllamas/stories260K.gguf"
     model_alias: str = "tinyllama-2"
     temperature: float = 0.8
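Widening `model_hf_repo` to `str | None` lets a preset opt out of fetching a model from Hugging Face altogether, which the new `router()` preset below relies on. A minimal sketch of the kind of guard this type makes possible when launch arguments are assembled; the `-hf`/`--hf-file` flags are llama-server options, but whether `ServerProcess.start()` builds them exactly this way is an assumption, not something shown in this diff:

    def hf_args(model_hf_repo: str | None, model_hf_file: str | None) -> list[str]:
        # Sketch only: a preset with no model (model_hf_repo is None) contributes no HF flags.
        args: list[str] = []
        if model_hf_repo is not None:
            args += ["-hf", model_hf_repo]
            if model_hf_file is not None:
                args += ["--hf-file", model_hf_file]
        return args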
@@ -519,9 +519,8 @@ class ServerPreset:
         server = ServerProcess()
         server.offline = True # will be downloaded by load_all()
         # mmproj is already provided by HF registry API
-        server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
-        server.model_hf_file = "tinygemma3-Q8_0.gguf"
-        server.mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf"
+        server.model_hf_file = None
+        server.model_hf_repo = "ggml-org/tinygemma3-GGUF:Q8_0"
         server.model_alias = "tinygemma3"
         server.n_ctx = 1024
         server.n_batch = 32
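The preset now names the model as `ggml-org/tinygemma3-GGUF:Q8_0`, a Hugging Face repo plus a quantization tag, instead of spelling out the GGUF filename, which is why `model_hf_file` becomes `None` and the explicit `mmproj_url` can go away (per the comment, the registry already provides the mmproj). The same `repo:tag` string is what the new test passes as `"model"`. A hypothetical helper, not part of the commit, just to illustrate how such a reference decomposes:

    def split_hf_ref(ref: str) -> tuple[str, str | None]:
        # "repo:tag" -> ("repo", "tag"); a bare "repo" -> ("repo", None)
        repo, _, tag = ref.partition(":")
        return repo, (tag or None)

    assert split_hf_ref("ggml-org/tinygemma3-GGUF:Q8_0") == ("ggml-org/tinygemma3-GGUF", "Q8_0")
    assert split_hf_ref("ggml-org/tinygemma3-GGUF") == ("ggml-org/tinygemma3-GGUF", None)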
@@ -530,6 +529,21 @@ class ServerPreset:
         server.seed = 42
         return server
 
+    @staticmethod
+    def router() -> ServerProcess:
+        server = ServerProcess()
+        # router server has no models
+        server.model_file = None
+        server.model_alias = None
+        server.model_hf_repo = None
+        server.model_hf_file = None
+        server.n_ctx = 1024
+        server.n_batch = 16
+        server.n_slots = 1
+        server.n_predict = 16
+        server.seed = 42
+        return server
+
 
 def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
     """
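Outside the pytest fixture, the new preset can be exercised directly. The sketch below assumes `ServerProcess` also exposes `start()`, `stop()` and a non-streaming `make_request()` returning a response with a `body` field, as in the existing test utilities (only `ServerPreset.router()` itself is added by this commit), and that the requested model is already in the local cache, per the TODO in the test:

    from utils import ServerPreset

    # Minimal standalone sketch, not part of the commit.
    server = ServerPreset.router()
    server.start()
    try:
        res = server.make_request("POST", "/chat/completions", data={
            "model": "ggml-org/tinygemma3-GGUF:Q8_0",
            "max_tokens": 16,
            "messages": [{"role": "user", "content": "hello"}],
        })
        print(res.body)
    finally:
        server.stop()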