test(server): add multi-image and no-image vision API tests

Add three new test cases to test_vision_api.py that address the TODO
for testing with multiple images and no images:

- test_vision_chat_completion_multiple_images: verifies the server
  handles multiple image_url content parts in a single request
- test_vision_chat_completion_no_image: verifies text-only messages
  work correctly on a multimodal model
- test_vision_chat_completion_no_image_content_parts: verifies
  content parts with only text type (no image_url) work correctly

The audio test TODO is narrowed to note it needs a model with audio
input support, which the current tinygemma3 test model lacks.
This commit is contained in:
Grisman, Jorge 2026-02-17 09:24:22 -06:00
parent 2ba9adc093
commit c82e486fdb
1 changed file with 64 additions and 1 deletion

View File

@ -70,7 +70,7 @@ def test_v1_models_supports_multimodal_capability():
("What is this:\n", "malformed", False, None),
("What is this:\n", "https://google.com/404", False, None), # non-existent image
("What is this:\n", "https://ggml.ai", False, None), # non-image data
# TODO @ngxson : test with multiple images, no images and with audio
# TODO @ngxson : test with audio (needs a model that supports audio input)
]
)
def test_vision_chat_completion(prompt, image_url, success, re_content):
@ -97,6 +97,69 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
assert res.status_code != 200
def test_vision_chat_completion_multiple_images():
    """Check that a single user message carrying two images is answered.

    The request holds one text part followed by two ``image_url`` parts.
    The test expects HTTP 200 and a non-empty assistant message in the
    first choice.
    """
    global server
    server.n_ctx = 2048
    server.n_slots = 1
    server.start()

    # Image URLs are resolved in the order they appear in the request.
    image_parts = [
        {"type": "image_url", "image_url": {"url": get_img_url(img_id)}}
        for img_id in ("IMG_URL_0", "IMG_URL_1")
    ]
    user_content = [{"type": "text", "text": "What are these:\n"}, *image_parts]
    payload = {
        "temperature": 0.0,
        "top_k": 1,
        "messages": [{"role": "user", "content": user_content}],
    }
    response = server.make_request("POST", "/chat/completions", data=payload)

    assert response.status_code == 200
    reply = response.body["choices"][0]["message"]
    assert "assistant" == reply["role"]
    assert len(reply["content"]) > 0
def test_vision_chat_completion_no_image():
    """Check that a plain-string user message (no images) is answered.

    The message content is a bare string rather than a list of content
    parts. The test expects HTTP 200 and a non-empty assistant message in
    the first choice.
    """
    global server
    server.start()

    payload = {
        "temperature": 0.0,
        "top_k": 1,
        "messages": [{"role": "user", "content": "Hello, how are you?"}],
    }
    response = server.make_request("POST", "/chat/completions", data=payload)

    assert response.status_code == 200
    reply = response.body["choices"][0]["message"]
    assert "assistant" == reply["role"]
    assert len(reply["content"]) > 0
def test_vision_chat_completion_no_image_content_parts():
    """Check that a content-parts message with only a text part is answered.

    The message content is a list holding a single ``text`` part and no
    ``image_url`` part. The test expects HTTP 200 and a non-empty assistant
    message in the first choice.
    """
    global server
    server.start()

    text_only_parts = [{"type": "text", "text": "Hello, how are you?"}]
    payload = {
        "temperature": 0.0,
        "top_k": 1,
        "messages": [{"role": "user", "content": text_only_parts}],
    }
    response = server.make_request("POST", "/chat/completions", data=payload)

    assert response.status_code == 200
    reply = response.body["choices"][0]["message"]
    assert "assistant" == reply["role"]
    assert len(reply["content"]) > 0
@pytest.mark.parametrize(
"prompt, image_data, success, re_content",
[