added tests

2025-11-04 15:58:30 +01:00 · 2025-11-04 15:58:30 +01:00 · cc18ecc5b7
parent 6441ad48c6
commit cc18ecc5b7
2 changed files with 121 additions and 0 deletions
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -1111,6 +1111,68 @@ static void test_template_output_parsers() {
                    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                }));
    }
    {
        // Qwen3-0.6B chat template: format detection, enable_thinking handling,
        // grammar contents under a REQUIRED tool choice, and reasoning + tool-call parsing.
        auto tmpls = read_templates("models/templates/Qwen-Qwen3-0.6B.jinja");
        std::vector<std::string> end_tokens{ "<|im_end|>" };

        // The template must resolve to the Hermes 2 Pro format both without and with tools.
        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);

        // enable_thinking = false: the rendered prompt must carry an empty think block.
        {
            common_chat_templates_inputs thinking_off;
            thinking_off.messages        = {message_user};
            thinking_off.tools           = tools;
            thinking_off.tool_choice     = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
            thinking_off.enable_thinking = false;

            auto applied = common_chat_templates_apply(tmpls.get(), thinking_off);
            assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, applied.format);

            const bool has_empty_think_block = applied.prompt.find("<think>\n\n</think>") != std::string::npos;
            assert_equals(true, has_empty_think_block);
        }

        // enable_thinking = true with a REQUIRED tool choice: the grammar must still
        // mention the thinking pattern.
        {
            common_chat_templates_inputs thinking_on;
            thinking_on.messages        = {message_user};
            thinking_on.tools           = tools;
            thinking_on.tool_choice     = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
            thinking_on.enable_thinking = true;

            auto applied = common_chat_templates_apply(tmpls.get(), thinking_on);
            assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, applied.format);

            // A grammar has to be produced at all ...
            const bool grammar_is_empty = applied.grammar.empty();
            assert_equals(false, grammar_is_empty);

            // ... it has to reference the closing think tag ...
            const bool has_think_close = applied.grammar.find("</think>") != std::string::npos;
            assert_equals(true, has_think_close);

            // ... and it has to reference either a "think-" substring or the literal opening tag.
            const bool lacks_think_open = applied.grammar.find("think-") == std::string::npos &&
                                          applied.grammar.find("<think>") == std::string::npos;
            assert_equals(true, !lacks_think_open);
        }

        // Parsing: a think block followed by a tool call yields the reasoning + tool-call message.
        const std::string thoughts_then_call =
            "<think>I'm\nthinking</think>\n"
            "<tool_call>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>";
        assert_msg_equals(message_assist_call_thoughts,
            common_chat_parse(
                thoughts_then_call,
                /* is_partial= */ false,
                {
                    /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
                    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                }));

        // Template generation with reasoning + tool calls. The expected delta is left
        // empty: don't check the exact delta, just verify it parses correctly.
        test_templates(
            tmpls.get(),
            end_tokens,
            message_assist_call_thoughts,
            tools,
            "",
            /* expect_grammar_triggered= */ true,
            /* test_grammar_if_triggered= */ true,
            COMMON_REASONING_FORMAT_DEEPSEEK);

        // The template must advertise enable_thinking support.
        const bool supports_thinking_switch = common_chat_templates_support_enable_thinking(tmpls.get());
        assert_equals(true, supports_thinking_switch);
    }
    {
        auto tmpls = read_templates("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja");
        std::vector<std::string>   end_tokens{ "<|eom_id|>", "<|eot_id|>" };
--- a/tools/server/tests/unit/test_tool_call.py
+++ b/tools/server/tests/unit/test_tool_call.py
@ -623,3 +623,62 @@ def do_test_hello_world(server: ServerProcess, **kwargs):
    code = actual_arguments["code"]
    assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
    assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', re.sub(r'#.*\n?', '', code)), f'Expected hello world, got {code}'
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("tool,hf_repo,template_override,reasoning_format", [
    (PYTHON_TOOL, "unsloth/Qwen3-0.6B-GGUF:Q4_K_M", None, 'deepseek'),
    (TEST_TOOL,   "unsloth/Qwen3-0.6B-GGUF:Q4_K_M", None, 'deepseek'),
])
def test_required_tool_with_reasoning(tool: dict, hf_repo: str, template_override: str | Tuple[str, str | None] | None, reasoning_format: Literal['deepseek', 'none'], stream: CompletionMode):
    """Check that `tool_choice: "required"` still yields reasoning content plus one tool call.

    Starts the server with the given model and reasoning format, sends a chat
    completion request (streamed or not, per `stream`) with a single tool and
    `tool_choice` set to "required", then asserts that the response message has
    non-trivial `reasoning_content` and exactly one tool call for the expected
    function with the expected argument key.

    Note: `template_override` belongs to the parametrization but is not read by
    this test; every current case passes None for it.
    """
    global server
    n_predict = 512

    # Configure and start the server with the requested reasoning format.
    server.reasoning_format = reasoning_format
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    server.start(timeout_seconds=TIMEOUT_START_SLOW)

    # Make the request with "tool_choice": "required"
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "Write an example"},
        ],
        # Tool use is forced by the tool choice, not by the prompt wording.
        "tool_choice": "required",
        "tools": [tool],
        "parallel_tool_calls": False,
        "stream": stream == CompletionMode.STREAMED,
        # Greedy sampling settings.
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
    }, timeout=TIMEOUT_HTTP_REQUEST)

    choice = body["choices"][0]

    # Reasoning must be present and longer than a token or two.
    reasoning_content: str | None = choice["message"].get("reasoning_content")
    assert reasoning_content is not None, 'Expected reasoning content, but got None'
    assert len(reasoning_content.strip()) > 3, 'Reasoning content is too small to be credible'

    # Exactly one tool call, targeting the requested tool.
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    # Tools of type "code_interpreter" are expected to surface under the name "python".
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]

    # The arguments are a JSON-encoded string; check the key specific to each tool.
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    if tool is PYTHON_TOOL:
        assert "code" in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: 'code'"
    elif tool is TEST_TOOL:
        assert "success" in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: 'success'"