From c82e486fdbfc57378c972d4a433fc657714dbd83 Mon Sep 17 00:00:00 2001
From: "Grisman, Jorge"
Date: Tue, 17 Feb 2026 09:24:22 -0600
Subject: [PATCH] test(server): add multi-image and no-image vision API tests

Add three new test cases to test_vision_api.py that address the TODO for
testing with multiple images and no images:

- test_vision_chat_completion_multiple_images: verifies the server
  handles multiple image_url content parts in a single request
- test_vision_chat_completion_no_image: verifies text-only messages
  work correctly on a multimodal model
- test_vision_chat_completion_no_image_content_parts: verifies content
  parts with only text type (no image_url) work correctly

The audio test TODO is narrowed to note it needs a model with audio
input support, which the current tinygemma3 test model lacks.
---
NOTE(review): this patch was re-flowed from a copy in which line breaks and
runs of whitespace had been collapsed. Indentation of the added lines was
rebuilt from the Python block/bracket structure. Column-alignment padding
inside the context and removed lines of the first hunk is not recoverable
from that copy, so applying may need `git apply --ignore-whitespace`
(TODO: confirm against the target tree). The From: header carries no e-mail
address in the recovered copy; none has been invented here.

 tools/server/tests/unit/test_vision_api.py | 65 +++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py
index 9408116d1c..29065acdd4 100644
--- a/tools/server/tests/unit/test_vision_api.py
+++ b/tools/server/tests/unit/test_vision_api.py
@@ -70,7 +70,7 @@ def test_v1_models_supports_multimodal_capability():
         ("What is this:\n", "malformed", False, None),
         ("What is this:\n", "https://google.com/404", False, None), # non-existent image
         ("What is this:\n", "https://ggml.ai", False, None), # non-image data
-        # TODO @ngxson : test with multiple images, no images and with audio
+        # TODO @ngxson : test with audio (needs a model that supports audio input)
     ]
 )
 def test_vision_chat_completion(prompt, image_url, success, re_content):
@@ -97,6 +97,69 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
         assert res.status_code != 200
 
 
+def test_vision_chat_completion_multiple_images():
+    """Test sending multiple images in a single chat completion request."""
+    global server
+    server.n_ctx = 2048
+    server.n_slots = 1
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "temperature": 0.0,
+        "top_k": 1,
+        "messages": [
+            {"role": "user", "content": [
+                {"type": "text", "text": "What are these:\n"},
+                {"type": "image_url", "image_url": {
+                    "url": get_img_url("IMG_URL_0"),
+                }},
+                {"type": "image_url", "image_url": {
+                    "url": get_img_url("IMG_URL_1"),
+                }},
+            ]},
+        ],
+    })
+    assert res.status_code == 200
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert len(choice["message"]["content"]) > 0
+
+
+def test_vision_chat_completion_no_image():
+    """Test sending a text-only message to a multimodal model (no images)."""
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "temperature": 0.0,
+        "top_k": 1,
+        "messages": [
+            {"role": "user", "content": "Hello, how are you?"},
+        ],
+    })
+    assert res.status_code == 200
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert len(choice["message"]["content"]) > 0
+
+
+def test_vision_chat_completion_no_image_content_parts():
+    """Test sending content parts with only text (no image_url parts)."""
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "temperature": 0.0,
+        "top_k": 1,
+        "messages": [
+            {"role": "user", "content": [
+                {"type": "text", "text": "Hello, how are you?"},
+            ]},
+        ],
+    })
+    assert res.status_code == 200
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert len(choice["message"]["content"]) > 0
+
+
 @pytest.mark.parametrize(
     "prompt, image_data, success, re_content",
     [