diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46a40df998..3e12b9d505 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -150,16 +150,15 @@ jobs: - name: Dawn Dependency id: dawn-depends run: | - DAWN_VERSION="v2.0.0" - DAWN_OWNER="reeselevine" + DAWN_VERSION="v20260317.182325" + DAWN_OWNER="google" DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release" - echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" - curl -L -o artifact.zip \ - "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" + DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release" + echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + curl -L -o artifact.tar.gz \ + "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" mkdir dawn - unzip artifact.zip - tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1 + tar -xvf artifact.tar.gz -C dawn --strip-components=1 - name: Build id: cmake_build @@ -384,16 +383,15 @@ jobs: id: dawn-depends run: | sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev - DAWN_VERSION="v2.0.0" - DAWN_OWNER="reeselevine" + DAWN_VERSION="v20260317.182325" + DAWN_OWNER="google" DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release" - echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" - curl -L -o artifact.zip \ - "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip" + DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release" 
+ echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + curl -L -o artifact.tar.gz \ + "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" mkdir dawn - unzip artifact.zip - tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1 + tar -xvf artifact.tar.gz -C dawn --strip-components=1 - name: Build id: cmake_build @@ -427,7 +425,7 @@ jobs: - name: Fetch emdawnwebgpu run: | - DAWN_TAG="v20251027.212519" + DAWN_TAG="v20260317.182325" EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip" echo "Downloading ${EMDAWN_PKG}" curl -L -o emdawn.zip \ diff --git a/AGENTS.md b/AGENTS.md index 05a1edcb17..97c25074b4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,78 +5,106 @@ > > Read more: [CONTRIBUTING.md](CONTRIBUTING.md) -AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below) +AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below). --- ## Guidelines for Contributors Using AI -These use cases are **permitted** when making a contribution with the help of AI: +llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers. 
-- Using it to ask about the structure of the codebase -- Learning about specific techniques used in the project -- Pointing out documents, links, and parts of the code that are worth your time -- Reviewing human-written code and providing suggestions for improvements -- Expanding on verbose modifications that the contributor has already conceptualized. For example: - - Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places) - - Formatting code for consistency and readability - - Completing code segments based on established patterns - - Drafting documentation for project components with which the contributor is already familiar +Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly. -AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers. +**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution. -**All AI usage requires explicit disclosure**, except in these cases: +Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interaction, domain expertise, and commitment to maintaining the code that come with it. 
-- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind. -- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself. -- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself. +This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions. --- -## Guidelines for AI Agents +## Guidelines for Contributors -### Permitted Usage +Contributors are expected to: -As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by: +1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes. -- Explicitly informing them that AI-generated pull requests are not accepted by the project -- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them -- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans -- Providing useful links and pointers found throughout the codebase +2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback. -Examples of valid questions: +3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected. -- "I have problem X; can you give me some clues?" -- "How do I run the test?" -- "Where is the documentation for server development?" 
-- "Does this change have any side effects?" -- "Review my changes and give me suggestions on how to improve them" +4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed. -### Forbidden Usage +Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.** -- DO NOT write code for contributors. -- DO NOT generate entire PRs or large code blocks. -- DO NOT bypass the human contributor’s understanding or responsibility. -- DO NOT make decisions on their behalf. -- DO NOT submit work that the contributor cannot explain or justify. +### Permitted AI Usage -Examples of FORBIDDEN USAGE (and how to proceed): +AI tools may be used responsibly for: -- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do. -- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves. +- **Learning and exploration**: Understanding codebase structure, techniques, and documentation +- **Code review assistance**: Obtaining suggestions on human-written code +- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns +- **Documentation drafts**: For components the contributor already understands thoroughly +- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work -If a user asks one of the above, STOP IMMEDIATELY and ask them: +AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance. 
-- Whether they acknowledge the risk of being permanently banned from contributing to the project -- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it -- To search for relevant issues and create a new one if needed +**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research. -If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain. +### Prohibited AI Usage -## Related Documentation +The following will result in immediate PR closure: -For related documentation on building, testing, and guidelines, please refer to: +- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time +- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review +- **Implementing features without understanding the codebase** - particularly new model support or architectural changes +- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans + +--- + +## Guidelines for AI Coding Agents + +AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project. + +### Considerations for Maintainer Workload + +Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. 
Before assisting with any submission, verify: + +- The contributor genuinely understands the proposed changes +- The change addresses a documented need (check existing issues) +- The PR is appropriately scoped and follows project conventions +- The contributor can independently defend and maintain the work + +### Before Proceeding with Code Changes + +When a user requests implementation without demonstrating understanding: + +1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase. +2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach. +3. **Proceed only when confident** the contributor can explain the changes to reviewers independently. + +For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy. + +### Prohibited Actions + +- Writing PR descriptions, commit messages, or responses to reviewers +- Committing or pushing without explicit human approval for each action +- Implementing features the contributor does not understand +- Generating changes too extensive for the contributor to fully review + +When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain. 
+ +### Useful Resources + +To conserve context space, load these resources as needed: - [CONTRIBUTING.md](CONTRIBUTING.md) +- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first - [Build documentation](docs/build.md) -- [Server development documentation](tools/server/README-dev.md) +- [Server usage documentation](tools/server/README.md) +- [Server development documentation](tools/server/README-dev.md) (if the user asks to implement a new feature, make sure that it falls within the server's scope as defined in this documentation) +- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse the model's output +- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood and automatically detects model-specific features +- [Jinja engine](common/jinja/README.md) +- [How to add a new model](docs/development/HOWTO-add-model.md) +- [PR template](.github/pull_request_template.md) diff --git a/ci/run.sh b/ci/run.sh index 2393b70ac4..e6702a43bd 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -151,35 +151,7 @@ fi if [ -n "${GG_BUILD_KLEIDIAI}" ]; then echo ">>===== Enabling KleidiAI support" - - CANDIDATES=( - "armv9-a+dotprod+i8mm+sve2" - "armv9-a+dotprod+i8mm" - "armv8.6-a+dotprod+i8mm" - "armv8.2-a+dotprod" - ) - CPU="" - - for cpu in "${CANDIDATES[@]}"; do - if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then - CPU="$cpu" - break - fi - done - - if [ -z "$CPU" ]; then - echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler." 
- exit 1 - fi - - echo ">>===== Using ARM baseline: ${CPU}" - - CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_KLEIDIAI=ON \ - -DGGML_CPU_AARCH64=ON \ - -DGGML_CPU_ARM_ARCH=${CPU} \ - -DBUILD_SHARED_LIBS=OFF" + CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON" fi if [ ! -z ${GG_BUILD_BLAS} ]; then diff --git a/common/chat.cpp b/common/chat.cpp index c2ca17c743..7536c0cd01 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1274,11 +1274,12 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp return data; } -// LFM2 format: -// - Reasoning: {reasoning} (optional, only if enable_thinking is true) -// - Content: text after reasoning (optional) -// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|> -// Tool calls can appear multiple times (parallel tool calls) +// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt +// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls. +// - Reasoning: {reasoning} (optional) +// - Content: text before a tool call (optional) +// - Tool calls: Python-style, e.g. 
[function_name(arg1="value1", arg2="value2")] +// Tool calls can appear multiple times (parallel tool calls supported) static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const autoparser::generation_params & inputs) { common_chat_params data; @@ -1319,9 +1320,9 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { return generation_prompt + reasoning + p.content(p.rest()) + end; } - auto tool_calls = p.rule("tool-calls", - p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) + + p.trigger_rule("tool-call", + p.literal(TOOL_CALL_START) + p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) + p.literal(TOOL_CALL_END) ) @@ -1349,6 +1350,80 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START } }; } + return data; +} + +// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens. +// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>. +// - Reasoning: {reasoning} (optional) +// - Content: text before a tool call (optional) +// - Tool calls: Python-style, e.g. 
[function_name(arg1="value1", arg2="value2")] +// Tool calls can appear multiple times (parallel tool calls supported) +static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl, + const autoparser::generation_params & inputs) { + common_chat_params data; + + data.prompt = common_chat_template_direct_apply(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; + data.preserved_tokens = { + "<|tool_call_start|>", + "<|tool_call_end|>", + "", + "", + }; + + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; + auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; + + const std::string THINK_START = ""; + const std::string THINK_END = ""; + + data.thinking_start_tag = THINK_START; + data.thinking_end_tag = THINK_END; + + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { + auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START); + auto end = p.end(); + + auto reasoning = p.eps(); + if (extract_reasoning && inputs.enable_thinking) { + reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END); + } + + if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { + return generation_prompt + reasoning + p.content(p.rest()) + end; + } + + auto tool_calls = p.rule("tool-calls", + p.trigger_rule("tool-call", + p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) + ) + ); + + auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["})); + auto maybe_start = p.optional(p.literal("<|tool_call_start|>")); + return generation_prompt + reasoning + content + maybe_start + tool_calls + end; + }); + + data.parser = parser.save(); + + if (include_grammar) { + data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; + data.grammar = build_grammar([&](const 
common_grammar_builder & builder) { + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + auto schema = function.at("parameters"); + builder.resolve_refs(schema); + }); + parser.build_grammar(builder, data.grammar_lazy); + }); + foreach_function(inputs.tools, [&](const json & tool) { + const std::string name = tool.at("function").at("name"); + data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" }); + }); + } return data; } @@ -1530,14 +1605,21 @@ static std::optional try_specialized_template( return common_chat_params_init_kimi_k2(tmpl, params); } - // LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format - // Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers + // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list + // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call if (src.find("<|tool_list_start|>") != std::string::npos && src.find("<|tool_list_end|>") != std::string::npos) { LOG_DBG("Using specialized template: LFM2\n"); return common_chat_params_init_lfm2(tmpl, params); } + // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens + if (src.find("List of tools: [") != std::string::npos && + src.find("<|tool_list_start|>") == std::string::npos) { + LOG_DBG("Using specialized template: LFM2.5\n"); + return common_chat_params_init_lfm2_5(tmpl, params); + } + // GigaChatV3 format detection if (src.find("<|role_sep|>") != std::string::npos && src.find("<|message_sep|>") != std::string::npos && diff --git a/docs/build.md b/docs/build.md index 508245d536..ef086ff434 100644 --- a/docs/build.md +++ b/docs/build.md @@ -728,7 +728,7 @@ To read documentation for how to build on Android, [click here](./android.md) ## WebGPU [In Progress] -The WebGPU backend relies on 
[Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`. +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`. In the llama.cpp directory, build with CMake: diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index a21c536104..addf93205e 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -340,7 +340,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const case 128: case 112: case 256: + if (V->ne[0] != K->ne[0]) { + return BEST_FATTN_KERNEL_NONE; + } + break; case 512: + if (V->ne[0] != K->ne[0]) { + return BEST_FATTN_KERNEL_NONE; + } if (!gqa_opt_applies) { return BEST_FATTN_KERNEL_NONE; } diff --git a/ggml/src/ggml-hexagon/htp/hvx-div.h b/ggml/src/ggml-hexagon/htp/hvx-div.h index 05cefea039..53ee304e74 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-div.h +++ b/ggml/src/ggml-hexagon/htp/hvx-div.h @@ -16,8 +16,10 @@ #if __HVX_ARCH__ < 79 #define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)) +#define HVX_OP_MUL_F16(a, b) Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b)) #else #define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b) +#define HVX_OP_MUL_F16(a, b) Q6_Vhf_vmpy_VhfVhf(a, b) #endif // Compute div by scaler in f32. Requires first by expanding fp32 to fp16 and converting the result back to fp32. 
@@ -43,46 +45,67 @@ static inline HVX_Vector hvx_div_mul_f16_const_using_f32(HVX_Vector vec1_hf, HVX return res; } -#define hvx_div_scaler_f16_loop_body(dst_type, src_type, vec_store) \ - do { \ - dst_type * restrict vdst = (dst_type *) dst; \ - src_type * restrict vsrc = (src_type *) src; \ - HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \ - \ - const uint32_t nvec = n / VLEN_FP16; \ - const uint32_t nloe = n % VLEN_FP16; \ - \ - uint32_t i = 0; \ - \ - _Pragma("unroll(4)") \ - for (; i < nvec; i++) { \ - HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \ - vdst[i] = res; \ - } \ - if (nloe) { \ - HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \ - vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \ - } \ +// Variant for =v79 +static inline HVX_Vector hvx_vec_hybrid_div_f16(HVX_Vector vec1, HVX_Vector vec2, HVX_Vector f32_nan_inf_mask, HVX_Vector f16_nan_inf_mask, HVX_Vector vec_hf_one_1_0) { +#if __HVX_ARCH__ < 79 + // For older architectures, use f16 reciprocal to avoid NaN/-inf issues + HVX_Vector vec2_inv = hvx_vec_inverse_f16_guard(vec2, f16_nan_inf_mask); + return HVX_OP_MUL_F16(vec1, vec2_inv); +#else + return hvx_vec_div_f16_using_f32(vec1, vec2, f32_nan_inf_mask, vec_hf_one_1_0); +#endif +} + #define hvx_div_f16_loop_body(dst_type, src0_type, src1_type, vec_store) \ do { \ dst_type * restrict vdst = (dst_type *) dst; \ src0_type * restrict vsrc0 = (src0_type *) src0; \ src1_type * restrict vsrc1 = (src1_type *) src1; \ \ - const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \ + const HVX_Vector f32_nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \ + const HVX_Vector f16_nan_inf_mask = Q6_Vh_vsplat_R(0x7c00); \ const HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \ \ const uint32_t nvec = n / VLEN_FP16; \ @@ -144,11 +179,15 @@ static inline HVX_Vector hvx_vec_div_f16_using_f32(HVX_Vector vec1, HVX_Vector v \ _Pragma("unroll(4)") \ for (; i < nvec; i++) { \ - HVX_Vector res = 
hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \ + HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \ + f32_nan_inf_mask, f16_nan_inf_mask, \ + hf_one); \ vdst[i] = res; \ } \ if (nloe) { \ - HVX_Vector res = hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \ + HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \ + f32_nan_inf_mask, f16_nan_inf_mask, \ + hf_one); \ vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \ } \ } while(0) @@ -247,5 +286,6 @@ HVX_DIV_DISPATCHER(hvx_div_f32) HVX_DIV_DISPATCHER(hvx_div_f16) #undef HVX_OP_MUL_F32 +#undef HVX_OP_MUL_F16 #endif // HVX_DIV_H diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 3d0928d4dc..13d28317d5 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -67,34 +67,61 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, uint8_t * restrict pad, const int num_elems, float epsilon) { + (void)pad; + const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; - HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares for full vectors + HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); - int step_of_1 = num_elems >> 5; #pragma unroll(4) - for (int i = 0; i < step_of_1; i++) { + for (int i = 0; i < nvec; i++) { HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); - sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); } - sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); // replicated over all lanes + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = 
Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + // Reduce HVX sum + sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); + // Scale full vectors HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); #pragma unroll(4) - for (int i = 0; i < step_of_1; i++) { + for (int i = 0; i < nvec; i++) { HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); - v_dst[i] = Q6_Vsf_equals_Vqf32(v2); + v_dst[i] = Q6_Vsf_equals_Vqf32(v2); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + HVX_Vector result = Q6_Vsf_equals_Vqf32(v2); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, result); } } diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0f6628c377..6f3fc5886d 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -9612,6 +9612,9 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t cl_mem B_image1d; cl_mem B_sub_buffer; cl_mem S_image1d; + // for B transpose + cl_mem B_image1d_trans = nullptr; + cl_mem B_d = nullptr; cl_mem D_image1d; cl_mem D_sub_buffer; @@ -9703,9 +9706,6 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t global_work_size[2] = 1; } else { cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_mem B_image1d_trans = nullptr; - // for B transpose - cl_mem B_d = nullptr; int padding; //how many extra 
elements beyond multiple of 8 @@ -9800,6 +9800,12 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t CL_CHECK(clReleaseMemObject(S_image1d)); CL_CHECK(clReleaseMemObject(D_sub_buffer)); CL_CHECK(clReleaseMemObject(D_image1d)); + if (B_image1d_trans) { + CL_CHECK(clReleaseMemObject(B_image1d_trans)); + } + if (B_d) { + CL_CHECK(clReleaseMemObject(B_d)); + } #else GGML_UNUSED(backend); GGML_UNUSED(src0); diff --git a/models/templates/LFM2.5-Instruct.jinja b/models/templates/LFM2.5-Instruct.jinja new file mode 100644 index 0000000000..7778756dd9 --- /dev/null +++ b/models/templates/LFM2.5-Instruct.jinja @@ -0,0 +1,45 @@ +{{- bos_token -}} +{%- set keep_past_thinking = keep_past_thinking | default(false) -%} +{%- set ns = namespace(system_prompt="") -%} +{%- if messages[0]["role"] == "system" -%} + {%- set ns.system_prompt = messages[0]["content"] -%} + {%- set messages = messages[1:] -%} +{%- endif -%} +{%- if tools -%} + {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%} + {%- for tool in tools -%} + {%- if tool is not string -%} + {%- set tool = tool | tojson -%} + {%- endif -%} + {%- set ns.system_prompt = ns.system_prompt + tool -%} + {%- if not loop.last -%} + {%- set ns.system_prompt = ns.system_prompt + ", " -%} + {%- endif -%} + {%- endfor -%} + {%- set ns.system_prompt = ns.system_prompt + "]" -%} +{%- endif -%} +{%- if ns.system_prompt -%} + {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}} +{%- endif -%} +{%- set ns.last_assistant_index = -1 -%} +{%- for message in messages -%} + {%- if message["role"] == "assistant" -%} + {%- set ns.last_assistant_index = loop.index0 -%} + {%- endif -%} +{%- endfor -%} +{%- for message in messages -%} + {{- "<|im_start|>" + message["role"] + "\n" -}} + {%- set content = message["content"] -%} + {%- if content is not string -%} + {%- set content = content | tojson -%} + {%- endif -%} + {%- if message["role"] == 
"assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%} + {%- if "" in content -%} + {%- set content = content.split("")[-1] | trim -%} + {%- endif -%} + {%- endif -%} + {{- content + "<|im_end|>\n" -}} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "<|im_start|>assistant\n" -}} +{%- endif -%} \ No newline at end of file diff --git a/scripts/server-test-function-call.py b/scripts/server-test-function-call.py new file mode 100755 index 0000000000..b3aae1a961 --- /dev/null +++ b/scripts/server-test-function-call.py @@ -0,0 +1,1135 @@ +#!/usr/bin/env python3 +""" +Test tool calling capability via chat completions endpoint. + +Each test case contains: + - tools: list of tool definitions (OpenAI-compatible) + - messages: initial conversation messages + - mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON) + - validate: callable(tool_calls_history, final_content) -> (passed: bool, reason: str) +""" + +import argparse +import json +import requests +import sys + +# --------------------------------------------------------------------------- +# Color / formatting helpers +# --------------------------------------------------------------------------- + +RESET = "\x1b[0m" +BOLD = "\x1b[1m" +DIM = "\x1b[2m" +# Foreground colors +CYAN = "\x1b[36m" +YELLOW = "\x1b[33m" +GREEN = "\x1b[32m" +RED = "\x1b[31m" +BLUE = "\x1b[34m" +WHITE = "\x1b[97m" + + +def _print(text="", end="\n"): + sys.stdout.write(text + end) + sys.stdout.flush() + + +def print_header(title): + bar = "─" * 60 + _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}") + _print( + f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}│{RESET}" + ) + _print(f"{BOLD}{CYAN}└{bar}┘{RESET}") + + +def print_tool_call(name, args): + args_str = json.dumps(args) + _print( + f"\n {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({args_str}){RESET}" + ) + + +def print_tool_result(result): + preview = result[:160] + ("…" if len(result) > 160 else "") + 
_print(f" {DIM}{BLUE}↳ result{RESET} {DIM}{preview}{RESET}") + + +def print_model_output(text): + # printed inline during streaming; prefix with a visual marker on first chunk + sys.stdout.write(text) + sys.stdout.flush() + + +def print_pass(reason): + _print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}") + + +def print_fail(reason): + _print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}") + + +def print_info(msg): + _print(f"{DIM}{msg}{RESET}") + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def chat_completion(url, messages, tools=None, stream=False): + payload = { + "messages": messages, + "stream": stream, + "max_tokens": 4096, + } + if tools: + payload["tools"] = tools + payload["tool_choice"] = "auto" + + try: + response = requests.post(url, json=payload, stream=stream) + response.raise_for_status() + except requests.exceptions.RequestException as e: + body = e.response.content if (e.response is not None) else b"" + print_fail(f"Request error: {e} | body: {body}") + return None + + full_content = "" + reasoning_content = "" + tool_calls: list[dict] = [] + + if stream: + for line in response.iter_lines(): + if not line: + continue + decoded = line.decode("utf-8") + if not decoded.startswith("data: "): + continue + data_str = decoded[6:] + if data_str == "[DONE]": + break + try: + data = json.loads(data_str) + except json.JSONDecodeError: + continue + choices = data.get("choices", []) + if not choices: + continue + delta = choices[0].get("delta", {}) + if delta.get("reasoning_content"): + reasoning_content += delta["reasoning_content"] + if delta.get("content"): + full_content += delta["content"] + print_model_output(delta["content"]) + for tc in delta.get("tool_calls", []): + idx = tc.get("index", 0) + while len(tool_calls) <= idx: + tool_calls.append( + { + "id": "", + "type": "function", + "function": {"name": "", "arguments": ""}, + } + 
) + if "id" in tc: + tool_calls[idx]["id"] += tc["id"] + if "function" in tc: + if "name" in tc["function"]: + tool_calls[idx]["function"]["name"] += tc["function"]["name"] + if "arguments" in tc["function"]: + tool_calls[idx]["function"]["arguments"] += tc["function"][ + "arguments" + ] + else: + data = response.json() + choices = data.get("choices", []) + if choices: + msg = choices[0].get("message", {}) + full_content = msg.get("content") or "" + reasoning_content = msg.get("reasoning_content") or "" + tool_calls = msg.get("tool_calls") or [] + if full_content: + print_model_output(full_content) + + result = {"content": full_content, "tool_calls": tool_calls} + if reasoning_content: + result["reasoning_content"] = reasoning_content + return result + + +def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6): + """ + Drive the multi-turn tool-call loop: + 1. Send messages to model. + 2. If the model returns tool calls, execute mocks and append results. + 3. Repeat until no more tool calls or max_turns reached. + + Returns (all_tool_calls, final_content). 
+ """ + msgs = list(messages) + all_tool_calls: list[dict] = [] + + for _ in range(max_turns): + result = chat_completion(url, msgs, tools=tools, stream=stream) + if result is None: + return all_tool_calls, None + + tcs = result.get("tool_calls") or [] + content = result.get("content") or "" + + if not tcs: + # Print a visual separator before the final model response + if content: + _print(f"\n{DIM}{'·'*60}{RESET}") + _print(f"{DIM} model response:{RESET}\n") + return all_tool_calls, content + + # Record tool calls for validation + all_tool_calls.extend(tcs) + + # Append assistant message with tool calls + assistant_msg: dict = { + "role": "assistant", + "content": content, + "tool_calls": tcs, + } + reasoning = result.get("reasoning_content") + if reasoning: + assistant_msg["reasoning_content"] = reasoning + msgs.append(assistant_msg) + + # Execute each tool call via mock and append tool result messages + for tc in tcs: + tool_name = tc["function"]["name"] + try: + args = json.loads(tc["function"]["arguments"]) + except json.JSONDecodeError: + args = {} + + print_tool_call(tool_name, args) + + mock_fn = mock_tool_responses.get(tool_name) + if mock_fn: + tool_result = mock_fn(args) + else: + tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"}) + + print_tool_result(tool_result) + + msgs.append( + { + "role": "tool", + "tool_call_id": tc.get("id", ""), + "content": tool_result, + } + ) + + return all_tool_calls, None + + +# --------------------------------------------------------------------------- +# Test case runner +# --------------------------------------------------------------------------- + + +def run_test(url, test_case, stream): + name = test_case["name"] + mode = f"{'stream' if stream else 'non-stream'}" + print_header(f"{name} [{mode}]") + + all_tool_calls, final_content = run_agentic_loop( + url, + messages=test_case["messages"], + tools=test_case["tools"], + mock_tool_responses=test_case["mock_tool_responses"], + stream=stream, + ) + + if 
final_content is None and not all_tool_calls: + print_fail("No response from server.") + return False + + passed, reason = test_case["validate"](all_tool_calls, final_content) + if passed: + print_pass(reason) + else: + print_fail(reason) + return passed + + +# --------------------------------------------------------------------------- +# Test case definitions +# --------------------------------------------------------------------------- + +# ---- Test 1: E-commerce multi-step search (Azzoo = anonymized marketplace) ---- + +_AZZOO_TOOLS = [ + { + "type": "function", + "function": { + "name": "azzoo_search_products", + "description": ( + "Search for products on Azzoo marketplace by keyword. " + "Returns a list of matching products with IDs, titles, ratings and prices." + ), + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search keyword or phrase", + }, + "page": { + "type": "string", + "description": "Page number (1-based)", + "default": "1", + }, + }, + "required": ["query"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "azzoo_get_product", + "description": "Retrieve detailed information about a specific Azzoo product including specs and price.", + "parameters": { + "type": "object", + "properties": { + "product_id": { + "type": "string", + "description": "Azzoo product identifier (e.g. 
AZB12345)", + }, + }, + "required": ["product_id"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "azzoo_get_reviews", + "description": "Fetch customer reviews for an Azzoo product.", + "parameters": { + "type": "object", + "properties": { + "product_id": { + "type": "string", + "description": "Azzoo product identifier", + }, + "page": { + "type": "string", + "description": "Review page number", + "default": "1", + }, + }, + "required": ["product_id"], + }, + }, + }, +] + +_AZZOO_SEARCH_RESULT = { + "results": [ + { + "product_id": "AZB00001", + "title": "SteelBrew Pro Kettle 1.7L", + "rating": 4.6, + "price": 34.99, + }, + { + "product_id": "AZB00002", + "title": "HeatKeep Gooseneck Kettle", + "rating": 4.3, + "price": 27.50, + }, + { + "product_id": "AZB00003", + "title": "QuickBoil Stainless Kettle", + "rating": 4.1, + "price": 21.00, + }, + ] +} +_AZZOO_PRODUCT_RESULT = { + "product_id": "AZB00001", + "title": "SteelBrew Pro Kettle 1.7L", + "price": 34.99, + "rating": 4.6, + "review_count": 2847, + "specs": { + "material": "18/8 stainless steel", + "capacity": "1.7 L", + "auto_shutoff": True, + "keep_warm": "30 min", + "warranty": "2 years", + }, +} +_AZZOO_REVIEWS_RESULT = { + "product_id": "AZB00001", + "average_rating": 4.6, + "reviews": [ + { + "rating": 5, + "title": "Excellent build quality", + "body": "Very sturdy, boils fast and stays warm longer than expected.", + }, + { + "rating": 5, + "title": "Great for loose-leaf tea", + "body": "The wide spout makes filling a teapot easy. 
No leaks after months of use.", + }, + { + "rating": 3, + "title": "Minor lid issue", + "body": "The lid doesn't always click shut properly, but overall happy with it.", + }, + { + "rating": 4, + "title": "Good value", + "body": "Heats quickly and the auto shutoff works reliably.", + }, + ], +} + +AZZOO_TEST_CASE = { + "name": "Azzoo E-commerce: search -> product detail -> reviews", + "messages": [ + { + "role": "user", + "content": ( + "I need a durable stainless steel tea kettle for my weekly tea gatherings. " + "Please search Azzoo for 'stainless steel tea kettle', then get full details " + "on the top-rated result, and finally fetch its customer reviews so I can " + "check for recurring complaints. Give me a summary with pros and cons." + ), + } + ], + "tools": _AZZOO_TOOLS, + "mock_tool_responses": { + "azzoo_search_products": lambda _: json.dumps(_AZZOO_SEARCH_RESULT), + "azzoo_get_product": lambda _: json.dumps(_AZZOO_PRODUCT_RESULT), + "azzoo_get_reviews": lambda _: json.dumps(_AZZOO_REVIEWS_RESULT), + }, + "validate": lambda tcs, content: _validate_azzoo(tcs, content), +} + + +def _validate_azzoo(tcs, content): + names = [tc["function"]["name"] for tc in tcs] + if not names: + return False, "No tool calls made" + if "azzoo_search_products" not in names: + return False, f"Expected azzoo_search_products to be called, got: {names}" + # After search the model should look up product details + if "azzoo_get_product" not in names and "azzoo_get_reviews" not in names: + return False, f"Expected follow-up product/review lookup, got: {names}" + # Verify product lookup used an ID from search results + for tc in tcs: + if tc["function"]["name"] == "azzoo_get_product": + try: + args = json.loads(tc["function"]["arguments"]) + pid = args.get("product_id", "") + if not pid: + return False, "azzoo_get_product called with empty product_id" + except json.JSONDecodeError: + return False, "azzoo_get_product arguments are not valid JSON" + if not content: + return False, "No 
final summary produced" + return True, f"All expected tools called in order: {names}" + + +# ---- Test 2: Fitness BMI + exercise recommendations ---- + +_FITNESS_TOOLS = [ + { + "type": "function", + "function": { + "name": "calculate_bmi", + "description": "Calculate Body Mass Index (BMI) from weight and height.", + "parameters": { + "type": "object", + "properties": { + "weight_kg": { + "type": "number", + "description": "Body weight in kilograms", + }, + "height_m": {"type": "number", "description": "Height in meters"}, + }, + "required": ["weight_kg", "height_m"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_exercises", + "description": ( + "Fetch a list of exercises filtered by muscle group, difficulty, category, " + "and/or force type." + ), + "parameters": { + "type": "object", + "properties": { + "muscle": { + "type": "string", + "description": "Target muscle group (e.g. chest, back, legs)", + }, + "difficulty": { + "type": "string", + "description": "Difficulty level: beginner, intermediate, expert", + }, + "category": { + "type": "string", + "description": "Exercise category (e.g. 
strength, cardio, stretching)", + }, + "force": { + "type": "string", + "description": "Force type: push, pull, static", + }, + }, + "required": [], + }, + }, + }, +] + +_BMI_RESULT = {"bmi": 24.5, "category": "Normal weight", "healthy_range": "18.5 – 24.9"} +_EXERCISES_RESULT = { + "exercises": [ + { + "name": "Push-Up", + "muscle": "chest", + "difficulty": "beginner", + "equipment": "none", + "instructions": "Keep body straight, lower chest to floor.", + }, + { + "name": "Incline Dumbbell Press", + "muscle": "chest", + "difficulty": "beginner", + "equipment": "dumbbells, bench", + "instructions": "Press dumbbells up from chest on incline bench.", + }, + { + "name": "Chest Fly (cables)", + "muscle": "chest", + "difficulty": "beginner", + "equipment": "cable machine", + "instructions": "Bring cables together in an arc motion.", + }, + ] +} + +FITNESS_TEST_CASE = { + "name": "Fitness: BMI calculation + exercise suggestions", + "messages": [ + { + "role": "user", + "content": ( + "I'm a 32-year-old male, 78 kg and 1.80 m tall. " + "Please calculate my BMI and then suggest some beginner chest exercises I can do " + "to build strength. Give me a short personalised plan." 
+ ), + } + ], + "tools": _FITNESS_TOOLS, + "mock_tool_responses": { + "calculate_bmi": lambda _: json.dumps(_BMI_RESULT), + "get_exercises": lambda _: json.dumps(_EXERCISES_RESULT), + }, + "validate": lambda tcs, content: _validate_fitness(tcs, content), +} + + +def _validate_fitness(tcs, content): + names = [tc["function"]["name"] for tc in tcs] + if not names: + return False, "No tool calls made" + if "calculate_bmi" not in names: + return False, f"Expected calculate_bmi to be called, got: {names}" + # Validate BMI args contain plausible values + for tc in tcs: + if tc["function"]["name"] == "calculate_bmi": + try: + args = json.loads(tc["function"]["arguments"]) + w = args.get("weight_kg") + h = args.get("height_m") + if w is None or h is None: + return False, f"calculate_bmi missing weight_kg or height_m: {args}" + if not (50 <= float(w) <= 200): + return False, f"calculate_bmi weight out of plausible range: {w}" + if not (1.0 <= float(h) <= 2.5): + return False, f"calculate_bmi height out of plausible range: {h}" + except (json.JSONDecodeError, ValueError) as e: + return False, f"calculate_bmi argument error: {e}" + if not content: + return False, "No final plan produced" + return True, f"Tools called: {names}" + + +# ---- Test 3: Community class planning (anonymised cooking/topic discovery) ---- + +_COMMUNITY_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_trending_questions", + "description": ( + "Fetch commonly asked questions on a topic from search engine 'People Also Ask' boxes." 
+ ), + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Topic to search for"}, + "max_results": { + "type": "integer", + "description": "Maximum questions to return", + "default": 10, + }, + }, + "required": ["query"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_mobile_apps", + "description": "Search the mobile app store for apps matching a category or keyword.", + "parameters": { + "type": "object", + "properties": { + "keyword": { + "type": "string", + "description": "Search keyword (e.g. 'Italian cooking')", + }, + "platform": { + "type": "string", + "enum": ["ios", "android", "both"], + "default": "both", + }, + "max_results": { + "type": "integer", + "description": "Number of results", + "default": 10, + }, + }, + "required": ["keyword"], + }, + }, + }, +] + +_TRENDING_QUESTIONS_RESULT = { + "query": "Italian cuisine", + "questions": [ + "What are the most popular Italian dishes?", + "What makes Italian food different from other cuisines?", + "How do you make authentic Italian pasta from scratch?", + "What are traditional Italian desserts?", + "What herbs are commonly used in Italian cooking?", + "Is Italian food healthy?", + "What wine pairs best with Italian pasta?", + ], +} +_APPS_RESULT = { + "keyword": "Italian cooking", + "results": [ + { + "name": "PastaPro", + "rating": 4.5, + "installs": "500K+", + "focus": "pasta recipes only", + }, + { + "name": "CookEasy", + "rating": 4.2, + "installs": "1M+", + "focus": "general cooking, limited Italian content", + }, + { + "name": "ItalianKitchen", + "rating": 3.8, + "installs": "100K+", + "focus": "regional Italian recipes, no video", + }, + ], +} + +COMMUNITY_CLASS_TEST_CASE = { + "name": "Community class planning: trending topics + app gap analysis", + "messages": [ + { + "role": "user", + "content": ( + "I want to start teaching Italian cooking classes at my community centre. 
" + "First, find out what people commonly ask about Italian cuisine online. " + "Then search for existing Italian cooking apps to see what they cover. " + "Use both results to suggest three unique angles for my classes that fill gaps " + "in what apps already offer." + ), + } + ], + "tools": _COMMUNITY_TOOLS, + "mock_tool_responses": { + "get_trending_questions": lambda _: json.dumps(_TRENDING_QUESTIONS_RESULT), + "search_mobile_apps": lambda _: json.dumps(_APPS_RESULT), + }, + "validate": lambda tcs, content: _validate_community(tcs, content), +} + + +def _validate_community(tcs, content): + names = [tc["function"]["name"] for tc in tcs] + if not names: + return False, "No tool calls made" + missing = [ + t for t in ("get_trending_questions", "search_mobile_apps") if t not in names + ] + if missing: + return False, f"Missing expected tool calls: {missing}; got: {names}" + if not content: + return False, "No class suggestion produced" + return True, f"Both discovery tools called: {names}" + + +# ---- Test 4: Multi-hostname geolocation filter (anonymized gallery discovery) ---- +# Inspired by: checking gallery website server locations to find truly remote venues. +# Anonymized: galleryone.de → halle-eins.de, gallerytwo.fr → galerie-deux.fr, +# gallerythree.it → galleria-tre.it + +_GEO_TOOLS = [ + { + "type": "function", + "function": { + "name": "lookup_ip_geolocation", + "description": ( + "Retrieve geolocation data for an IP address or hostname, including country, " + "city, coordinates, and network info. Useful for verifying physical server " + "locations or personalising regional content." + ), + "parameters": { + "type": "object", + "properties": { + "host": { + "type": "string", + "description": "IP address or hostname to look up (e.g. 
'8.8.8.8' or 'example.com').", + }, + }, + "required": ["host"], + }, + }, + }, +] + +# Mock: one urban (Berlin → discard), two rural (keep) +_GEO_RESPONSES = { + "halle-eins.de": { + "host": "halle-eins.de", + "city": "Berlin", + "country": "DE", + "lat": 52.5200, + "lon": 13.4050, + "is_major_city": True, + }, + "galerie-deux.fr": { + "host": "galerie-deux.fr", + "city": "Rocamadour", + "country": "FR", + "lat": 44.7994, + "lon": 1.6178, + "is_major_city": False, + }, + "galleria-tre.it": { + "host": "galleria-tre.it", + "city": "Matera", + "country": "IT", + "lat": 40.6664, + "lon": 16.6044, + "is_major_city": False, + }, +} + + +def _geo_mock(args): + host = args.get("host", "") + return json.dumps(_GEO_RESPONSES.get(host, {"error": f"unknown host: {host}"})) + + +GEO_TEST_CASE = { + "name": "Gallery geolocation: filter urban venues, keep remote ones", + "messages": [ + { + "role": "user", + "content": ( + "I have abstract paintings to exhibit in remote European galleries. " + "I received enquiries from three venues: halle-eins.de, galerie-deux.fr, " + "and galleria-tre.it. Please look up the geolocation of each website's server. " + "Discard any venue whose server is in a major city (e.g. Berlin, Paris, Rome). " + "For the remaining venues, report their exact coordinates so I can check " + "whether hiking trails are nearby — my work thrives where nature and art meet." 
+ ), + } + ], + "tools": _GEO_TOOLS, + "mock_tool_responses": { + "lookup_ip_geolocation": _geo_mock, + }, + "validate": lambda tcs, content: _validate_geo(tcs, content), +} + + +def _validate_geo(tcs, content): + names = [tc["function"]["name"] for tc in tcs] + if not names: + return False, "No tool calls made" + # Expect exactly one geolocation call per domain (3 total) + geo_calls = [tc for tc in tcs if tc["function"]["name"] == "lookup_ip_geolocation"] + if len(geo_calls) < 3: + return ( + False, + f"Expected geolocation called 3 times (once per domain), got {len(geo_calls)}", + ) + queried_hosts = set() + for tc in geo_calls: + try: + args = json.loads(tc["function"]["arguments"]) + host = args.get("host", "") + if not host: + return False, f"lookup_ip_geolocation called with empty host: {args}" + queried_hosts.add(host) + except json.JSONDecodeError: + return False, "lookup_ip_geolocation arguments are not valid JSON" + expected = {"halle-eins.de", "galerie-deux.fr", "galleria-tre.it"} + if not expected.issubset(queried_hosts): + return ( + False, + f"Not all domains queried. Expected {expected}, got {queried_hosts}", + ) + if not content: + return False, "No final summary produced" + return True, f"All 3 domains geolocated: {sorted(queried_hosts)}" + + +# ---- Test 5: EV fleet expansion — stock → security → property → video ---- +# Inspired by: multi-step business analysis combining finance, cybersecurity, +# real estate and educational content. +# Anonymized: Tesla → Voltara (VLTR), Rivian → Rivex (RVXN), +# Trenton → Halverton + +_EV_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_stock_quote", + "description": "Retrieve the latest market quote for a financial instrument by ticker symbol.", + "parameters": { + "type": "object", + "properties": { + "symbol": { + "type": "string", + "description": "Ticker symbol (e.g. 
'VLTR', 'RVXN')", + }, + "interval": { + "type": "string", + "description": "Time interval: 1min, 5min, 1h, 1day, 1week", + "default": "1day", + }, + }, + "required": ["symbol"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_security_advisories", + "description": ( + "Fetch current cybersecurity advisories from the national security agency, " + "covering known vulnerabilities and exploits for industrial and consumer systems." + ), + "parameters": { + "type": "object", + "properties": { + "keyword": { + "type": "string", + "description": "Filter advisories by keyword or product name", + }, + "limit": { + "type": "integer", + "description": "Maximum number of advisories to return", + "default": 5, + }, + }, + "required": [], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_commercial_properties", + "description": "Search for commercial properties (offices, garages, warehouses) available for rent or sale in a given city.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name to search in"}, + "property_type": { + "type": "string", + "description": "Type of property: office, garage, warehouse, premises", + }, + "operation": { + "type": "string", + "enum": ["rent", "sale"], + "default": "rent", + }, + "max_price": { + "type": "integer", + "description": "Maximum monthly rent or sale price", + }, + }, + "required": ["city", "property_type"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_video_recommendations", + "description": "Fetch a list of recommended videos related to a given topic or reference video.", + "parameters": { + "type": "object", + "properties": { + "topic": { + "type": "string", + "description": "Topic or keyword to search for related videos", + }, + }, + "required": ["topic"], + }, + }, + }, +] + +_STOCK_RESULT_VLTR = { + "symbol": "VLTR", + "company": "Voltara Inc.", + "price": 218.45, + "change_pct": "+2.3%", + 
"market_cap": "694B", + "currency": "USD", +} +_STOCK_RESULT_RVXN = { + "symbol": "RVXN", + "company": "Rivex Motors", + "price": 12.80, + "change_pct": "-1.1%", + "market_cap": "11B", + "currency": "USD", +} +_ADVISORIES_RESULT = { + "count": 2, + "advisories": [ + { + "id": "ICSA-24-102-01", + "title": "Voltara In-Vehicle Infotainment System Authentication Bypass", + "severity": "Medium", + "summary": "Improper authentication in the OTA update module may allow an adjacent attacker to install unsigned firmware.", + "published": "2024-04-11", + }, + { + "id": "ICSA-24-085-03", + "title": "Voltara Charging Management API Input Validation Flaw", + "severity": "Low", + "summary": "Insufficient input validation in the charging session API could expose internal error messages.", + "published": "2024-03-26", + }, + ], +} +_PROPERTIES_RESULT = { + "city": "Halverton", + "listings": [ + { + "id": "HV-0041", + "type": "garage", + "area_sqm": 420, + "monthly_rent": 2800, + "ev_power_outlets": 12, + "address": "14 Ironworks Lane, Halverton", + }, + { + "id": "HV-0089", + "type": "warehouse", + "area_sqm": 900, + "monthly_rent": 4200, + "ev_power_outlets": 30, + "address": "7 Depot Road, Halverton", + }, + ], +} +_VIDEOS_RESULT = { + "topic": "fleet electrification", + "recommendations": [ + { + "title": "How to Build an EV Fleet from Scratch", + "channel": "Fleet Future", + "views": "182K", + }, + { + "title": "EV Charging Infrastructure for Commercial Fleets", + "channel": "GreenDrive Pro", + "views": "94K", + }, + { + "title": "Total Cost of Ownership: Electric vs Diesel Vans", + "channel": "LogisticsTech", + "views": "61K", + }, + ], +} + + +def _ev_stock_mock(args): + symbol = args.get("symbol", "").upper() + if symbol == "VLTR": + return json.dumps(_STOCK_RESULT_VLTR) + if symbol == "RVXN": + return json.dumps(_STOCK_RESULT_RVXN) + return json.dumps({"error": f"Unknown symbol: {symbol}"}) + + +EV_FLEET_TEST_CASE = { + "name": "EV fleet expansion: stock → cybersecurity → 
property → videos", + "messages": [ + { + "role": "user", + "content": ( + "I'm expanding my courier business into electric vehicles and need a multi-step analysis:\n" + "1. Get the latest stock quote for Voltara (VLTR) and Rivex (RVXN). " + "If either is above $50, continue with that company.\n" + "2. Search for cybersecurity advisories related to that company's vehicle models " + "to understand any tech risks.\n" + "3. Find commercial garage or warehouse properties in Halverton suitable for " + "EV charging infrastructure.\n" + "4. Recommend videos on fleet electrification strategies.\n" + "Please work through all four steps and give me a concise summary." + ), + } + ], + "tools": _EV_TOOLS, + "mock_tool_responses": { + "get_stock_quote": _ev_stock_mock, + "get_security_advisories": lambda _: json.dumps(_ADVISORIES_RESULT), + "search_commercial_properties": lambda _: json.dumps(_PROPERTIES_RESULT), + "get_video_recommendations": lambda _: json.dumps(_VIDEOS_RESULT), + }, + "validate": lambda tcs, content: _validate_ev(tcs, content), +} + + +def _validate_ev(tcs, content): + names = [tc["function"]["name"] for tc in tcs] + if not names: + return False, "No tool calls made" + # Stock quote must come first + if names[0] != "get_stock_quote": + return False, f"Expected get_stock_quote to be called first, got: {names[0]}" + stock_calls = [tc for tc in tcs if tc["function"]["name"] == "get_stock_quote"] + for tc in stock_calls: + try: + args = json.loads(tc["function"]["arguments"]) + sym = args.get("symbol", "") + if not sym: + return False, f"get_stock_quote called with empty symbol: {args}" + except json.JSONDecodeError: + return False, "get_stock_quote arguments are not valid JSON" + # All four pipeline tools expected + required = [ + "get_stock_quote", + "get_security_advisories", + "search_commercial_properties", + "get_video_recommendations", + ] + missing = [t for t in required if t not in names] + if missing: + return False, f"Missing pipeline steps: 
{missing}" + if not content: + return False, "No final summary produced" + return True, f"Full 4-step pipeline executed: {names}" + + +# --------------------------------------------------------------------------- +# All test cases +# --------------------------------------------------------------------------- + +ALL_TEST_CASES = [ + AZZOO_TEST_CASE, + FITNESS_TEST_CASE, + COMMUNITY_CLASS_TEST_CASE, + GEO_TEST_CASE, + EV_FLEET_TEST_CASE, +] + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Test llama-server tool-calling capability." + ) + parser.add_argument("--host", default="localhost") + parser.add_argument("--port", default=8080, type=int) + parser.add_argument( + "--no-stream", action="store_true", help="Disable streaming mode tests" + ) + parser.add_argument( + "--stream-only", action="store_true", help="Only run streaming mode tests" + ) + parser.add_argument( + "--test", + help="Run only the test whose name contains this substring (case-insensitive)", + ) + args = parser.parse_args() + + url = f"http://{args.host}:{args.port}/v1/chat/completions" + print_info(f"Testing server at {url}") + + modes = [] + if not args.stream_only: + modes.append(False) + if not args.no_stream: + modes.append(True) + + cases: list[dict] = ALL_TEST_CASES + if args.test: + name_filter = args.test.lower() + cases = [c for c in cases if name_filter in str(c["name"]).lower()] + if not cases: + print_fail(f"No test cases matched '{args.test}'") + sys.exit(1) + + total = 0 + passed = 0 + for stream in modes: + for case in cases: + total += 1 + if run_test(url, case, stream=stream): + passed += 1 + + color = GREEN if passed == total else RED + _print(f"\n{BOLD}{color}{'─'*60}{RESET}") + _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}") + _print(f"{BOLD}{color}{'─'*60}{RESET}\n") + 
sys.exit(0 if passed == total else 1) + + +if __name__ == "__main__": + main() diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index c2833b75ce..0e7d96ca10 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -19,7 +19,7 @@ // dedup helpers -static ggml_tensor * build_kq_mask( +static ggml_tensor * build_attn_inp_kq_mask( ggml_context * ctx, const llama_kv_cache_context * mctx, const llama_ubatch & ubatch, @@ -28,7 +28,11 @@ static ggml_tensor * build_kq_mask( const auto n_tokens = ubatch.n_tokens; const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; - return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); + ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); + ggml_set_input(res); + ggml_set_name(res, "attn_inp_kq_mask"); + + return res; } static bool can_reuse_kq_mask( @@ -52,6 +56,21 @@ static bool can_reuse_kq_mask( // impl +static ggml_tensor * ggml_mul_mat_aux( + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * rot) { + const auto n = rot->ne[0]; + + ggml_tensor * res; + + res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n); + res = ggml_mul_mat (ctx, rot, res); + res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); + + return res; +} + void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { if (ubatch->token) { const int64_t n_tokens = ubatch->n_tokens; @@ -429,6 +448,14 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) { mctx->set_input_v_idxs(self_v_idxs, ubatch); mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + + if (self_k_rot) { + mctx->set_input_k_rot(self_k_rot); + } + + if (self_v_rot) { + mctx->set_input_v_rot(self_v_rot); + } } bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) { @@ -476,6 +503,14 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, 
ubatch); mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + + if (self_k_rot) { + mctx->get_base()->set_input_k_rot(self_k_rot); + } + + if (self_v_rot) { + mctx->get_base()->set_input_v_rot(self_v_rot); + } } bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { @@ -532,6 +567,14 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + if (inp_attn->self_k_rot) { + mctx->get_attn()->set_input_k_rot(inp_attn->self_k_rot); + } + + if (inp_attn->self_v_rot) { + mctx->get_attn()->set_input_v_rot(inp_attn->self_v_rot); + } + const int64_t n_rs = mctx->get_recr()->get_n_rs(); if (inp_rs->s_copy) { @@ -630,6 +673,14 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); } + if (inp_attn->self_k_rot) { + attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot); + } + + if (inp_attn->self_v_rot) { + attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot); + } + const int64_t n_rs = mctx->get_recr()->get_n_rs(); if (inp_rs->s_copy) { @@ -2002,13 +2053,13 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams); - - ggml_set_input(inp->self_kq_mask); - + inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams); inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; } + inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0); + inp->self_v_rot = mctx_cur->build_input_v_rot(ctx0); + return inp; } @@ -2034,6 +2085,15 @@ ggml_tensor * llm_graph_context::build_attn( int il) const { GGML_ASSERT(v_mla == nullptr); + if (inp->self_k_rot) { + q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot); + k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot); + } + + if (inp->self_v_rot) { + v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot); + } + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced // expand k later to enable rope fusion which directly writes into k-v cache @@ -2061,6 +2121,10 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); cb(cur, "kqv_out", il); + if (inp->self_v_rot) { + cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot); + } + if (wo) { cur = build_lora_mm(wo, cur); if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) { @@ -2090,9 +2154,7 @@ static std::unique_ptr build_attn_inp_k_impl( inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); - inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams); - ggml_set_input(inp->self_kq_mask); - + inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams); inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; } @@ -2171,6 +2233,18 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v_mla, float kq_scale, int il) const { + if (inp->self_k_rot) { + q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot); + if (k_cur) { + k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot); + } + } + if (inp->self_v_rot) { + if (v_cur) { + v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot); + } + } + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); @@ -2211,6 +2285,10 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); cb(cur, "kqv_out", il); + if (inp->self_v_rot) { + cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot); + } + if (wo) { cur = build_lora_mm(wo, cur); } @@ -2293,12 +2371,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams); - ggml_set_input(inp->self_kq_mask); - ggml_set_name(inp->self_kq_mask, "self_kq_mask"); - + inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams); inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; - ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv"); } { @@ -2307,14 +2381,13 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch); - inp->self_kq_mask_swa = build_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams); - ggml_set_input(inp->self_kq_mask_swa); - ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa"); - + inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams); inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; - ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv"); } + inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0); + inp->self_v_rot = mctx_cur->get_base()->build_input_v_rot(ctx0); + return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp)); } @@ -2473,9 +2546,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch); inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch); - inp_attn->self_kq_mask = build_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams); - ggml_set_input(inp_attn->self_kq_mask); - + inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams); inp_attn->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask; } @@ -2483,9 +2554,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch); inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch); - inp_attn->self_kq_mask_swa = build_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams); - ggml_set_input(inp_attn->self_kq_mask_swa); - + inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams); inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa; } diff --git a/src/llama-graph.h b/src/llama-graph.h index 4855685ef7..bb0ad75198 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -308,6 +308,10 @@ public: ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + // note: assumes v_rot^2 == I + ggml_tensor * self_k_rot = nullptr; + ggml_tensor * self_v_rot = nullptr; + // note: these have to be copies because in order to be able to reuse a graph, its inputs // need to carry these parameters with them. 
otherwise, they can point to freed // llm_graph_params from a previous batch, causing stack-use-after-return @@ -384,6 +388,10 @@ public: ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + // note: using same rotation matrices for both base and swa cache + ggml_tensor * self_k_rot = nullptr; + ggml_tensor * self_v_rot = nullptr; + const llama_hparams hparams; const llama_cparams cparams; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5f57ba9e1d..3e0fd3107f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -13,6 +13,65 @@ #include #include +static bool ggml_is_power_of_2(int n) { + return (n & (n - 1)) == 0; +} + +// orthonormal Walsh-Hadamard rotation matrix +// note: res^2 == I +static void ggml_gen_hadamard(ggml_tensor * tensor) { + assert(tensor->type == GGML_TYPE_F32); + + const int n = tensor->ne[0]; + + assert(ggml_is_power_of_2(n)); + assert(tensor->ne[1] == n); + assert(tensor->ne[2] == 1); + assert(tensor->ne[3] == 1); + + std::vector data_f32; + + float * data = (float *) tensor->data; + + if (tensor->type != GGML_TYPE_F32) { + data_f32.resize(n*n); + data = data_f32.data(); + } + + data[0*n + 0] = 1.0 / sqrtf(n); + + for (int s = 1; s < n; s *= 2) { + for (int i = 0; i < s; i++) { + for (int j = 0; j < s; j++) { + const float val = data[i*n + j]; + + data[(i + s)*n + (j )] = val; + data[(i )*n + (j + s)] = val; + data[(i + s)*n + (j + s)] = -val; + } + } + } + + if (tensor->type != GGML_TYPE_F32) { + ggml_quantize_chunk(tensor->type, data, tensor->data, 0, 1, n*n, nullptr); + } +} + +static ggml_tensor * ggml_mul_mat_aux( + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * rot) { + const auto n = rot->ne[0]; + + ggml_tensor * res; + + res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n); + res = ggml_mul_mat (ctx, rot, res); + res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], 
cur->ne[2], cur->ne[3]); + + return res; +} + // // llama_kv_cache // @@ -209,6 +268,48 @@ llama_kv_cache::llama_kv_cache( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } + const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); + const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false; + if (attn_rot_disable) { + LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__); + } + + attn_rot_k = + !attn_rot_disable && + ggml_is_quantized(type_k) && + !hparams.is_n_embd_k_gqa_variable() && + hparams.n_embd_head_k() % 64 == 0; + + attn_rot_v = + !attn_rot_disable && + ggml_is_quantized(type_v) && + !hparams.is_n_embd_v_gqa_variable() && + hparams.n_embd_head_v() % 64 == 0; + + LLAMA_LOG_INFO("%s: attn_rot_k = %d\n", __func__, attn_rot_k); + LLAMA_LOG_INFO("%s: attn_rot_v = %d\n", __func__, attn_rot_v); + + // pre-compute the Hadamard matrices and keep them in host memory + // TODO: in the future, we can make copies in the backend buffers to avoid host -> device transfers + if (attn_rot_k || attn_rot_v) { + for (int64_t n = 64; n <= std::max(hparams.n_embd_head_k(), hparams.n_embd_head_v()); n *= 2) { + attn_rot_hadamard[n] = std::vector(n*n); + + ggml_init_params params = { + /* .mem_size = */ 1*ggml_tensor_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + + ggml_context_ptr ctx { ggml_init(params) }; + + ggml_tensor * tmp = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, n, n); + tmp->data = attn_rot_hadamard[n].data(); + + ggml_gen_hadamard(tmp); + } + } + const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG"); debug = LLAMA_KV_CACHE_DEBUG ? 
atoi(LLAMA_KV_CACHE_DEBUG) : 0; } @@ -1004,6 +1105,14 @@ bool llama_kv_cache::get_has_shift() const { return result; } +ggml_type llama_kv_cache::type_k() const { + return layers[0].k->type; +} + +ggml_type llama_kv_cache::type_v() const { + return layers[0].v->type; +} + uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const { uint32_t result = 0; @@ -1189,6 +1298,47 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama return v_idxs; } +ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const { + ggml_tensor * res = nullptr; + + if (attn_rot_k) { + int nrot = 64; + + // TODO: investigate if using the smallest rotation matrix is beneficial also for K (similar as for V) + // ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088 + do { + nrot *= 2; + } while (hparams.n_embd_head_k() % nrot == 0); + nrot /= 2; + + res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot); + ggml_set_input(res); + ggml_set_name(res, "attn_inp_k_rot"); + } + + return res; +} + +ggml_tensor * llama_kv_cache::build_input_v_rot(ggml_context * ctx) const { + ggml_tensor * res = nullptr; + + if (attn_rot_v) { + int nrot = 64; + // using smaller rotation matrices for V seems beneficial + // ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4146397570 + //do { + // nrot *= 2; + //} while (hparams.n_embd_head_v() % nrot == 0); + //nrot /= 2; + + res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot); + ggml_set_input(res); + ggml_set_name(res, "attn_inp_v_rot"); + } + + return res; +} + void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); @@ -1507,6 +1657,24 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch } } +void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const { + 
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + + const auto n_rot = dst->ne[0]; + GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0])); + + memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst)); +} + +void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const { + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + + const auto n_rot = dst->ne[0]; + GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0])); + + memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst)); +} + size_t llama_kv_cache::total_size() const { size_t size = 0; @@ -1542,6 +1710,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift( ggml_context * ctx, ggml_tensor * cur, ggml_tensor * shift, + ggml_tensor * rot, ggml_tensor * factors, float freq_base, float freq_scale, @@ -1567,10 +1736,16 @@ ggml_tensor * llama_kv_cache::build_rope_shift( // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx, cur, GGML_TYPE_F32); + // rotate back + tmp = ggml_mul_mat_aux(ctx, tmp, rot); + tmp = ggml_rope_ext(ctx, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + // rotate fwd + tmp = ggml_mul_mat_aux(ctx, tmp, rot); + tmp = ggml_cpy(ctx, tmp, cur); } else { // we rotate only the first n_rot dimensions @@ -1591,6 +1766,9 @@ public: ggml_tensor * k_shift; // I32 [kv_size*n_stream] + // note: assumes k_rot^2 == I + ggml_tensor * k_rot = nullptr; + const llama_kv_cache * kv_self; }; @@ -1600,6 +1778,10 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { if (k_shift) { kv_self->set_input_k_shift(k_shift); } + + if (k_rot) { + kv_self->set_input_k_rot(k_rot); + } } ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { @@ -1611,6 +1793,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); 
ggml_set_input(inp->k_shift); + inp->k_rot = build_input_k_rot(ctx); + const auto & cparams = lctx->get_cparams(); for (const auto & layer : layers) { @@ -1635,7 +1819,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co ggml_row_size(layer.k->type, n_embd_k_gqa), ggml_row_size(layer.k->type, n_embd_nope)); - ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il); + ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->k_rot, rope_factors, freq_base_l, freq_scale_l, il); ggml_build_forward_expand(gf, cur); } @@ -2239,6 +2423,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const { return n_kv; } +ggml_type llama_kv_cache_context::type_k() const { + return kv->type_k(); +} + +ggml_type llama_kv_cache_context::type_v() const { + return kv->type_v(); +} + ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv, sinfos[i_cur]); } @@ -2263,6 +2455,14 @@ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, con return kv->build_input_v_idxs(ctx, ubatch); } +ggml_tensor * llama_kv_cache_context::build_input_k_rot(ggml_context * ctx) const { + return kv->build_input_k_rot(ctx); +} + +ggml_tensor * llama_kv_cache_context::build_input_v_rot(ggml_context * ctx) const { + return kv->build_input_v_rot(ctx); +} + void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const { kv->set_input_k_shift(dst); } @@ -2282,3 +2482,11 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { kv->set_input_pos_bucket(dst, ubatch); } + +void llama_kv_cache_context::set_input_k_rot(ggml_tensor * dst) const { + kv->set_input_k_rot(dst); +} + +void llama_kv_cache_context::set_input_v_rot(ggml_tensor * dst) const { + kv->set_input_v_rot(dst); +} diff --git 
a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 90a0610c49..d4569a06f7 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -152,6 +152,9 @@ public: bool get_has_shift() const; + ggml_type type_k() const; + ggml_type type_v() const; + // // graph_build API // @@ -191,6 +194,9 @@ public: ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; + ggml_tensor * build_input_k_rot(ggml_context * ctx) const; + ggml_tensor * build_input_v_rot(ggml_context * ctx) const; + void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const; @@ -199,6 +205,9 @@ public: void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; + void set_input_k_rot(ggml_tensor * dst) const; + void set_input_v_rot(ggml_tensor * dst) const; + private: const llama_model & model; const llama_hparams & hparams; @@ -226,6 +235,13 @@ private: // SWA const uint32_t n_swa = 0; + // env: LLAMA_ATTN_ROT_DISABLE + bool attn_rot_k = false; + bool attn_rot_v = false; + + // pre-computed hadamard matrices + std::unordered_map> attn_rot_hadamard; + // env: LLAMA_KV_CACHE_DEBUG int debug = 0; @@ -262,6 +278,7 @@ private: ggml_context * ctx, ggml_tensor * cur, ggml_tensor * shift, + ggml_tensor * rot, ggml_tensor * factors, float freq_base, float freq_scale, @@ -328,6 +345,9 @@ public: uint32_t get_n_kv() const; + ggml_type type_k() const; + ggml_type type_v() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; @@ -347,6 +367,9 @@ public: ggml_tensor * build_input_k_idxs(ggml_context * ctx, const 
llama_ubatch & ubatch) const; ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; + ggml_tensor * build_input_k_rot(ggml_context * ctx) const; + ggml_tensor * build_input_v_rot(ggml_context * ctx) const; + void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const; void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const; @@ -354,6 +377,9 @@ public: void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; + void set_input_k_rot(ggml_tensor * dst) const; + void set_input_v_rot(ggml_tensor * dst) const; + private: llama_memory_status status; diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 1c4da68195..b66916687b 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -2712,6 +2712,67 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } + // LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens + { + auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug); + + // Basic content only + tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); + + // Single tool call without reasoning + tst.test("[special_function(arg1=1)]") + .tools({ special_function_tool }) + .expect(message_assist_call) + .run(); + + // Tool call with string argument + tst.test("[get_time(city=\"XYZCITY\")]") + .tools({ get_time_tool }) + .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) + .run(); + + // Tool call with reasoning (enable_thinking=true) + tst.test("I'm\nthinking[special_function(arg1=1)]") + .enable_thinking(true) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ special_function_tool }) + .expect(message_assist_call_thoughts) + .run(); + + // Multiple tool calls (parallel) + tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, 
arg2=2)]") + .parallel_tool_calls(true) + .tools({ + special_function_tool, special_function_tool_with_optional_param + }) + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", {} }, + { "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} }, + }) + .run(); + + // Tool call with content before tool call + tst.test("Let me check the time.[get_time(city=\"Paris\")]") + .tools({ get_time_tool }) + .expect(message_with_reasoning_content_and_multiple_tool_calls( + "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } } + )) + .run(); + + // Partial tool call (streaming) + tst.test("[special_function(arg1=") + .tools({ special_function_tool }) + .is_partial(true) + .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": ")) + .run(); + + // Tool call with empty arguments + tst.test("[empty_args()]") + .tools({ empty_args_tool }) + .expect(simple_assist_msg("", "", "empty_args", "{}")) + .run(); + } + // Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format // Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|> { diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index f52240b106..be2af26223 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -143,7 +143,11 @@ bool server_http_context::init(const common_params & params) { "/v1/health", "/models", "/v1/models", - "/api/tags" + "/api/tags", + "/", + "/index.html", + "/bundle.js", + "/bundle.css", }; // If API key is not set, skip validation @@ -151,8 +155,8 @@ bool server_http_context::init(const common_params & params) { return true; } - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { + // If path is public or static file, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end()) { return true; } diff --git a/tools/server/tests/unit/test_security.py 
b/tools/server/tests/unit/test_security.py index 8c38b89d53..bb22095f12 100644 --- a/tools/server/tests/unit/test_security.py +++ b/tools/server/tests/unit/test_security.py @@ -22,6 +22,15 @@ def test_access_public_endpoint(endpoint: str): assert "error" not in res.body +def test_access_static_assets_without_api_key(): + """Static web UI assets should not require API key authentication (issue #21229)""" + global server + server.start() + for path in ["/", "/bundle.js", "/bundle.css"]: + res = server.make_request("GET", path) + assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}" + + @pytest.mark.parametrize("api_key", [None, "invalid-key"]) def test_incorrect_api_key(api_key: str): global server