Compare commits

...

16 Commits

Author SHA1 Message Date
Marko Tombak c0f6eea67e
Merge 6403785b94 into fbd441c379 2026-04-02 03:44:16 +03:00
Todor Boinovski fbd441c379
hexagon : add cumsum op support (#21246)
* hexagon : add cumsum op support

* hexagon: enable dma for cumsum op

* Fix line-ending

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
2026-04-01 17:44:02 -07:00
Xuan-Son Nguyen c30e012253
contrib : rewrite AGENTS.md, make it more clear about project values (#21270)
* contrib : rewrite AGENTS.md, make it more clear about types of permitted AI usage

* permit AI for writing code
2026-04-01 23:31:51 +02:00
lhez 95a6ebabb2
opencl: fix leak in Adreno q8_0 path (#21212) 2026-04-01 12:54:58 -07:00
Aleksander Grygier 12dbf1da95
server: Bypass API Key validation for WebUI static bundle assets (#21269)
* fix: Bypass API Key validation for static bundle assets

* refactor: All bypassed routes in `public_endpoints`

* test: Update static assets API Key test
2026-04-01 21:32:15 +02:00
Johannes Gäßler 86221cf6da
CUDA: fix FA kernel selection logic (#21271) 2026-04-01 22:28:19 +03:00
Martin Klacer 6de97b9d3e
kleidiai: add CPU feature detection to CI run script (#20394)
* kleidiai: add cpu feature detection to CI run script

Signed-off-by: Martin Klacer <martin.klacer@arm.com>
Change-Id: I663adc3a7691a98e7dac5488962c13cc344f034a

* kleidiai: revert unrelated requirements change

Signed-off-by: Martin Klacer <martin.klacer@arm.com>

* kleidiai: removed cpu feature detection from CI run script

 * As per the maintainers' suggestion, removed cpu feature detection
   from CI run script as CMake handles it already

Signed-off-by: Martin Klacer <martin.klacer@arm.com>

---------

Signed-off-by: Martin Klacer <martin.klacer@arm.com>
2026-04-01 20:02:41 +03:00
Nikhil Jain 5a0ed5150a
Update Dawn version in WebGPU CI (#20784)
* Pin Dawn version

* Update docs with new Dawn commit hash
2026-04-01 09:53:05 -07:00
Aparna M P 8710e5f9b9
hexagon: improve RMS_NORM and DIV accuracy (#21251)
* hexagon-rms_norm: fix RMS_NORM for non-aligned tensor sizes

Co-authored-by: Krishna Sridhar <srsr@qti.qualcomm.com>

* hexagon-div: perform DIV in fp16 domain for lower dsp archs

---------

Co-authored-by: Krishna Sridhar <srsr@qti.qualcomm.com>
2026-04-01 08:43:08 -07:00
Jonathan 1d6d4cf7a5
fix: tool call parsing for LFM2 and LFM2.5 models (#21242)
* fix: tool call parsing for LFM2 and LFM2.5 models'

* refactor: add test / break out lfm2 and lfm2.5 parsing logic
2026-04-01 16:22:44 +02:00
Georgi Gerganov 744c0c7310
llama : rotate activations for better quantization (#21038)
* llama : rotate activations for better quantization

* cont : rotate V more + refactor

* cont : rotate caches separately + support non-power-of-2 head sizes

* cont : simplify

* cont : add reference for V rotation

* cont : refactor

* cont : support context shift

* cont : consolidate

* cont : dedup + allow different types for the rotation matrix

* cont : add env variable to disable rotation

* cont : simplify attn rot kv cache logic + rename env

* cont : pre-compute the Hadamard matrices
2026-04-01 16:58:01 +03:00
Xuan-Son Nguyen 0356e33aaf
scripts: add function call test script (#21234)
* scripts: add function call test script

* add reasoning_content

* fix lint
2026-04-01 15:31:58 +02:00
Georgi Gerganov 6422036fcb sync : ggml 2026-04-01 16:03:17 +03:00
Georgi Gerganov 296bc0538b ggml : bump version to 0.9.10 (ggml/1454) 2026-04-01 16:03:17 +03:00
Marko Tombak 6403785b94 parallel load: default to all GPU contexts 2026-03-23 18:21:47 +02:00
Marko Tombak 74faaaf7d2 parallel load: multi-context model loading 2026-03-23 18:21:25 +02:00
31 changed files with 2352 additions and 180 deletions

View File

@ -150,16 +150,15 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@ -384,16 +383,15 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@ -427,7 +425,7 @@ jobs:
- name: Fetch emdawnwebgpu
run: |
DAWN_TAG="v20251027.212519"
DAWN_TAG="v20260317.182325"
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
echo "Downloading ${EMDAWN_PKG}"
curl -L -o emdawn.zip \

120
AGENTS.md
View File

@ -5,78 +5,106 @@
>
> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below)
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
---
## Guidelines for Contributors Using AI
These use cases are **permitted** when making a contribution with the help of AI:
llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
- Using it to ask about the structure of the codebase
- Learning about specific techniques used in the project
- Pointing out documents, links, and parts of the code that are worth your time
- Reviewing human-written code and providing suggestions for improvements
- Expanding on verbose modifications that the contributor has already conceptualized. For example:
- Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
- Formatting code for consistency and readability
- Completing code segments based on established patterns
- Drafting documentation for project components with which the contributor is already familiar
Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.
**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
**All AI usage requires explicit disclosure**, except in these cases:
Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.
This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
---
## Guidelines for AI Agents
## Guidelines for Contributors
### Permitted Usage
Contributors are expected to:
As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:
1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
- Explicitly informing them that AI-generated pull requests are not accepted by the project
- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Providing useful links and pointers found throughout the codebase
2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
Examples of valid questions:
3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
- "I have problem X; can you give me some clues?"
- "How do I run the test?"
- "Where is the documentation for server development?"
- "Does this change have any side effects?"
- "Review my changes and give me suggestions on how to improve them"
4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
### Forbidden Usage
Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
- DO NOT write code for contributors.
- DO NOT generate entire PRs or large code blocks.
- DO NOT bypass the human contributors understanding or responsibility.
- DO NOT make decisions on their behalf.
- DO NOT submit work that the contributor cannot explain or justify.
### Permitted AI Usage
Examples of FORBIDDEN USAGE (and how to proceed):
AI tools may be used responsibly for:
- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.
- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
- **Code review assistance**: Obtaining suggestions on human-written code
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
- **Documentation drafts**: For components the contributor already understands thoroughly
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
If a user asks one of the above, STOP IMMEDIATELY and ask them:
AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
- Whether they acknowledge the risk of being permanently banned from contributing to the project
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
- To search for relevant issues and create a new one if needed
**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.
### Prohibited AI Usage
## Related Documentation
The following will result in immediate PR closure:
For related documentation on building, testing, and guidelines, please refer to:
- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
---
## Guidelines for AI Coding Agents
AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
### Considerations for Maintainer Workload
Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
- The contributor genuinely understands the proposed changes
- The change addresses a documented need (check existing issues)
- The PR is appropriately scoped and follows project conventions
- The contributor can independently defend and maintain the work
### Before Proceeding with Code Changes
When a user requests implementation without demonstrating understanding:
1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
### Prohibited Actions
- Writing PR descriptions, commit messages, or responses to reviewers
- Committing or pushing without explicit human approval for each action
- Implementing features the contributor does not understand
- Generating changes too extensive for the contributor to fully review
When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
### Useful Resources
To conserve context space, load these resources as needed:
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
- [Build documentation](docs/build.md)
- [Server development documentation](tools/server/README-dev.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
- [Jinja engine](common/jinja/README.md)
- [How to add a new model](docs/development/HOWTO-add-model.md)
- [PR template](.github/pull_request_template.md)

View File

@ -151,35 +151,7 @@ fi
if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
echo ">>===== Enabling KleidiAI support"
CANDIDATES=(
"armv9-a+dotprod+i8mm+sve2"
"armv9-a+dotprod+i8mm"
"armv8.6-a+dotprod+i8mm"
"armv8.2-a+dotprod"
)
CPU=""
for cpu in "${CANDIDATES[@]}"; do
if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
CPU="$cpu"
break
fi
done
if [ -z "$CPU" ]; then
echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
exit 1
fi
echo ">>===== Using ARM baseline: ${CPU}"
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
-DGGML_NATIVE=OFF \
-DGGML_CPU_KLEIDIAI=ON \
-DGGML_CPU_AARCH64=ON \
-DGGML_CPU_ARM_ARCH=${CPU} \
-DBUILD_SHARED_LIBS=OFF"
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
fi
if [ ! -z ${GG_BUILD_BLAS} ]; then

View File

@ -2359,6 +2359,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_SPLIT_MODE"));
add_opt(common_arg(
{"-pl", "--parallel-load"}, "N",
"max parallel jobs for model loading (default: all GPUs, 1 = sequential)",
[](common_params & params, int value) {
params.n_parallel_load = value;
if (params.n_parallel_load <= 0) {
params.n_parallel_load = -1; // unlimited
}
#ifdef _WIN32
_putenv_s("LLAMA_ARG_PARALLEL_LOAD", std::to_string(params.n_parallel_load).c_str());
#else
setenv("LLAMA_ARG_PARALLEL_LOAD", std::to_string(params.n_parallel_load).c_str(), 1);
#endif
}
).set_env("LLAMA_ARG_PARALLEL_LOAD"));
add_opt(common_arg(
{"-ts", "--tensor-split"}, "N0,N1,N2,...",
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",

View File

@ -1274,11 +1274,12 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
@ -1319,9 +1320,9 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.trigger_rule("tool-call",
p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
@ -1349,6 +1350,80 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
)
);
auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
foreach_function(inputs.tools, [&](const json & tool) {
const std::string name = tool.at("function").at("name");
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
});
}
return data;
}
@ -1530,14 +1605,21 @@ static std::optional<common_chat_params> try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
if (src.find("List of tools: [") != std::string::npos &&
src.find("<|tool_list_start|>") == std::string::npos) {
LOG_DBG("Using specialized template: LFM2.5\n");
return common_chat_params_init_lfm2_5(tmpl, params);
}
// GigaChatV3 format detection
if (src.find("<|role_sep|>") != std::string::npos &&
src.find("<|message_sep|>") != std::string::npos &&

View File

@ -444,6 +444,8 @@ struct common_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = all GPUs, 1 = sequential)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

View File

@ -728,7 +728,7 @@ To read documentation for how to build on Android, [click here](./android.md)
## WebGPU [In Progress]
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
In the llama.cpp directory, build with CMake:

View File

@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 9)
set(GGML_VERSION_PATCH 10)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@ -340,7 +340,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
case 128:
case 112:
case 256:
if (V->ne[0] != K->ne[0]) {
return BEST_FATTN_KERNEL_NONE;
}
break;
case 512:
if (V->ne[0] != K->ne[0]) {
return BEST_FATTN_KERNEL_NONE;
}
if (!gqa_opt_applies) {
return BEST_FATTN_KERNEL_NONE;
}

View File

@ -2231,6 +2231,22 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
return true;
}
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * dst = op;
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
return false;
}
GGML_UNUSED(sess);
return true;
}
enum dspqbuf_type {
DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
@ -2399,6 +2415,16 @@ static inline size_t init_repeat_req(htp_general_req * req, dspqueue_buffer * bu
return n_bufs;
}
static inline size_t init_cumsum_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
req->op = HTP_OP_CUMSUM;
size_t n_bufs = 0;
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
return n_bufs;
}
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
req->op = HTP_OP_GET_ROWS;
@ -2780,6 +2806,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
break;
case GGML_OP_CUMSUM:
ggml_hexagon_dispatch_op<init_cumsum_req>(sess, node, flags);
break;
default:
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
}
@ -3254,6 +3284,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
supp = ggml_hexagon_supported_ssm_conv(sess, op);
break;
case GGML_OP_CUMSUM:
supp = ggml_hexagon_supported_cumsum(sess, op);
break;
default:
break;
}

View File

@ -33,6 +33,7 @@ add_library(${HTP_LIB} SHARED
repeat-ops.c
argsort-ops.c
ssm-conv.c
cumsum-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE

View File

@ -0,0 +1,267 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hvx-types.h"
#include "hvx-utils.h"
#include "hex-dma.h"
#define htp_cumsum_tensors_preamble \
struct htp_tensor * restrict src0 = &octx->src0; \
struct htp_tensor * restrict dst = &octx->dst; \
\
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct htp_cumsum_context {
struct htp_ops_context * octx;
size_t src_row_size;
size_t dst_row_size;
size_t src_row_size_aligned;
size_t dst_row_size_aligned;
uint32_t rows_per_thread;
uint32_t total_rows;
};
#define htp_cumsum_preamble \
struct htp_cumsum_context * cctx = (struct htp_cumsum_context *) data; \
struct htp_ops_context * octx = cctx->octx; \
htp_cumsum_tensors_preamble; \
dma_queue * dma_queue = octx->ctx->dma[ith];
// ---------------------------------------------------------------------------
// HVX prefix scan helpers
// ---------------------------------------------------------------------------
#if __HVX_ARCH__ > 75
static inline HVX_Vector hvx_cumsum_vadd(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vadd_VsfVsf(a, b);
}
#else
static inline HVX_Vector hvx_cumsum_vadd(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
}
#endif // __HVX_ARCH__ > 75
static inline HVX_Vector hvx_prefix_scan_f32(HVX_Vector v, HVX_Vector carry_in) {
const HVX_Vector zero = Q6_V_vsplat_R(0);
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 4));
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 8));
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 16));
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 32));
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 64));
v = hvx_cumsum_vadd(v, carry_in);
return v;
}
static inline HVX_Vector hvx_splat_last_f32(HVX_Vector v) {
return hvx_vec_repl4(Q6_V_vror_VR(v, 124));
}
static inline void hvx_cumsum_row_f32(const float * restrict src, float * restrict dst, uint32_t n) {
const uint32_t nvec = n / VLEN_FP32;
const uint32_t nloe = n % VLEN_FP32;
HVX_Vector carry = Q6_V_vsplat_R(0);
for (uint32_t i = 0; i < nvec; i++) {
HVX_Vector v = *((const HVX_UVector *) (src + i * VLEN_FP32));
v = hvx_prefix_scan_f32(v, carry);
hvx_vec_store_u(dst + i * VLEN_FP32, VLEN, v);
carry = hvx_splat_last_f32(v);
}
if (nloe) {
float acc = hvx_vec_get_f32(carry);
const float * src_tail = src + nvec * VLEN_FP32;
float * dst_tail = dst + nvec * VLEN_FP32;
for (uint32_t i = 0; i < nloe; i++) {
acc += src_tail[i];
dst_tail[i] = acc;
}
}
}
// ---------------------------------------------------------------------------
// Per thread worker: Double-buffered DMA
// ---------------------------------------------------------------------------
static void cumsum_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
htp_cumsum_preamble;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const uint32_t ir0 = cctx->rows_per_thread * ith;
const uint32_t ir1 = MIN(ir0 + cctx->rows_per_thread, cctx->total_rows);
if (ir0 >= ir1) {
return;
}
const size_t src_row_size = cctx->src_row_size;
const size_t dst_row_size = cctx->dst_row_size;
const size_t src_row_size_aligned = cctx->src_row_size_aligned;
const size_t dst_row_size_aligned = cctx->dst_row_size_aligned;
const uint8_t * src_data = (const uint8_t *) src0->data;
uint8_t * dst_data = (uint8_t *) dst->data;
uint8_t * src_spad = octx->src0_spad.data + (ith * src_row_size_aligned * 2);
uint8_t * dst_spad = octx->dst_spad.data + (ith * dst_row_size_aligned * 2);
for (uint32_t ir = ir0, spad_idx = 0; ir < ir1 && spad_idx < 2; ir++, spad_idx++) {
// Dummy dst writeback to establish queue ordering
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(dst_data, dst_spad + (spad_idx * dst_row_size_aligned)),
dst_row_size, dst_row_size_aligned, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src_spad + (spad_idx * src_row_size_aligned),
src_data + (ir * src_row_size)),
src_row_size_aligned, src_row_size, 1);
}
for (uint32_t ir = ir0; ir < ir1; ir++) {
float * dst_spad_row = (float *) dma_queue_pop(dma_queue).src;
float * src_spad_row = (float *) dma_queue_pop(dma_queue).dst;
hvx_cumsum_row_f32(src_spad_row, dst_spad_row, ne00);
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(dst_data + (ir * dst_row_size), (uint8_t *) dst_spad_row),
dst_row_size, dst_row_size_aligned, 1);
const uint32_t next_row = ir + 2;
if (next_row < ir1) {
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr((uint8_t *) src_spad_row, src_data + (next_row * src_row_size)),
src_row_size_aligned, src_row_size, 1);
}
}
dma_queue_flush(dma_queue);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "cumsum-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
// ---------------------------------------------------------------------------
// Per thread worker: Direct HVX (no DMA)
// ---------------------------------------------------------------------------
static void cumsum_thread_f32(unsigned int nth, unsigned int ith, void * data) {
htp_cumsum_preamble;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const uint8_t * src_data = (const uint8_t *) src0->data;
uint8_t * dst_data = (uint8_t *) dst->data;
const uint32_t ir0 = cctx->rows_per_thread * ith;
const uint32_t ir1 = MIN(ir0 + cctx->rows_per_thread, cctx->total_rows);
for (uint32_t ir = ir0; ir < ir1; ir++) {
const float * restrict src_row = (const float *) (src_data + ir * cctx->src_row_size);
float * restrict dst_row = (float *) (dst_data + ir * cctx->dst_row_size);
hvx_cumsum_row_f32(src_row, dst_row, ne00);
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "cumsum-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
int op_cumsum_f32(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
const uint32_t total_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const uint32_t n_threads = MIN(octx->n_threads, total_rows);
const size_t src_row_size = src0->nb[1];
const size_t dst_row_size = dst->nb[1];
const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
// 2 ping-pong buffers per thread for src and dst
const size_t spad_per_thread = 2 * (src_row_size_aligned + dst_row_size_aligned);
octx->src0_spad.size_per_thread = src_row_size_aligned * 2;
octx->dst_spad.size_per_thread = dst_row_size_aligned * 2;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
struct htp_cumsum_context cctx = {
.octx = octx,
.src_row_size = src_row_size,
.dst_row_size = dst_row_size,
.src_row_size_aligned = src_row_size_aligned,
.dst_row_size_aligned = dst_row_size_aligned,
.rows_per_thread = (total_rows + n_threads - 1) / n_threads,
.total_rows = total_rows,
};
if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
worker_pool_run_func(octx->ctx->worker_pool, cumsum_thread_f32, &cctx, n_threads);
} else {
worker_pool_run_func(octx->ctx->worker_pool, cumsum_thread_f32_dma, &cctx, n_threads);
}
return HTP_STATUS_OK;
}
int op_cumsum(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
struct htp_tensor * dst = &octx->dst;
switch (dst->type) {
case HTP_TYPE_F32:
err = op_cumsum_f32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -75,6 +75,7 @@ enum htp_op {
HTP_OP_SUM_ROWS,
HTP_OP_SSM_CONV,
HTP_OP_REPEAT,
HTP_OP_CUMSUM,
INVALID
};

View File

@ -60,5 +60,6 @@ int op_cpy(struct htp_ops_context * octx);
int op_repeat(struct htp_ops_context * octx);
int op_argsort(struct htp_ops_context * octx);
int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
#endif /* HTP_OPS_H */

View File

@ -16,8 +16,10 @@
#if __HVX_ARCH__ < 79
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
#define HVX_OP_MUL_F16(a, b) Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b))
#else
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
#define HVX_OP_MUL_F16(a, b) Q6_Vhf_vmpy_VhfVhf(a, b)
#endif
// Compute div by scaler in f32. Requires first by expanding fp32 to fp16 and converting the result back to fp32.
@ -43,46 +45,67 @@ static inline HVX_Vector hvx_div_mul_f16_const_using_f32(HVX_Vector vec1_hf, HVX
return res;
}
#define hvx_div_scaler_f16_loop_body(dst_type, src_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
src_type * restrict vsrc = (src_type *) src; \
HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
\
const uint32_t nvec = n / VLEN_FP16; \
const uint32_t nloe = n % VLEN_FP16; \
\
uint32_t i = 0; \
\
_Pragma("unroll(4)") \
for (; i < nvec; i++) { \
HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
vdst[i] = res; \
} \
if (nloe) { \
HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
} \
// Variant for <v79: Use pre-computed f16 reciprocal constant
static inline HVX_Vector hvx_div_mul_f16_const_using_f16(HVX_Vector vec1_hf, HVX_Vector const_inv_hf) {
// Multiply by pre-computed f16 reciprocal constant
return HVX_OP_MUL_F16(vec1_hf, const_inv_hf);
}
#define hvx_div_scaler_f16_loop_body(dst_type, src_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
src_type * restrict vsrc = (src_type *) src; \
\
HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
\
const uint32_t nvec = n / VLEN_FP16; \
const uint32_t nloe = n % VLEN_FP16; \
\
uint32_t i = 0; \
\
_Pragma("unroll(4)") \
for (; i < nvec; i++) { \
HVX_Vector res; \
if (__HVX_ARCH__ < 79) { \
res = hvx_div_mul_f16_const_using_f16(vsrc[i], val_vec_f16); \
} else { \
res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
} \
vdst[i] = res; \
} \
if (nloe) { \
HVX_Vector res; \
if (__HVX_ARCH__ < 79) { \
res = hvx_div_mul_f16_const_using_f16(vsrc[i], val_vec_f16); \
} else { \
res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
} \
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
} \
} while(0)
static inline void hvx_div_scalar_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
assert((uintptr_t) dst % 128 == 0);
assert((uintptr_t) src % 128 == 0);
hvx_div_scaler_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}
static inline void hvx_div_scalar_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
assert((uintptr_t) dst % 128 == 0);
hvx_div_scaler_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}
static inline void hvx_div_scalar_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
assert((uintptr_t) src % 128 == 0);
hvx_div_scaler_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}
static inline void hvx_div_scalar_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
hvx_div_scaler_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
@ -128,13 +151,25 @@ static inline HVX_Vector hvx_vec_div_f16_using_f32(HVX_Vector vec1, HVX_Vector v
return recip;
}
// Hybrid approach: f16 reciprocal for <v79, f32 precision for >=v79
static inline HVX_Vector hvx_vec_hybrid_div_f16(HVX_Vector vec1, HVX_Vector vec2, HVX_Vector f32_nan_inf_mask, HVX_Vector f16_nan_inf_mask, HVX_Vector vec_hf_one_1_0) {
#if __HVX_ARCH__ < 79
// For older architectures, use f16 reciprocal to avoid NaN/-inf issues
HVX_Vector vec2_inv = hvx_vec_inverse_f16_guard(vec2, f16_nan_inf_mask);
return HVX_OP_MUL_F16(vec1, vec2_inv);
#else
return hvx_vec_div_f16_using_f32(vec1, vec2, f32_nan_inf_mask, vec_hf_one_1_0);
#endif
}
#define hvx_div_f16_loop_body(dst_type, src0_type, src1_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
src0_type * restrict vsrc0 = (src0_type *) src0; \
src1_type * restrict vsrc1 = (src1_type *) src1; \
\
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \
const HVX_Vector f32_nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \
const HVX_Vector f16_nan_inf_mask = Q6_Vh_vsplat_R(0x7c00); \
const HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
\
const uint32_t nvec = n / VLEN_FP16; \
@ -144,11 +179,15 @@ static inline HVX_Vector hvx_vec_div_f16_using_f32(HVX_Vector vec1, HVX_Vector v
\
_Pragma("unroll(4)") \
for (; i < nvec; i++) { \
HVX_Vector res = hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \
HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \
f32_nan_inf_mask, f16_nan_inf_mask, \
hf_one); \
vdst[i] = res; \
} \
if (nloe) { \
HVX_Vector res = hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \
HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \
f32_nan_inf_mask, f16_nan_inf_mask, \
hf_one); \
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
} \
} while(0)
@ -247,5 +286,6 @@ HVX_DIV_DISPATCHER(hvx_div_f32)
HVX_DIV_DISPATCHER(hvx_div_f16)
#undef HVX_OP_MUL_F32
#undef HVX_OP_MUL_F16
#endif // HVX_DIV_H

View File

@ -860,6 +860,41 @@ static void proc_ssm_conv_req(struct htp_context * ctx, struct htp_general_req *
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
}
static void proc_cumsum_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[1];
// We've written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[1].fd;
rsp_bufs[0].ptr = bufs[1].ptr;
rsp_bufs[0].offset = bufs[1].offset;
rsp_bufs[0].size = bufs[1].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.dst.data = (uint32_t) bufs[1].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_cumsum(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
}
static void proc_activations_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
@ -1474,6 +1509,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
proc_ssm_conv_req(ctx, &req, bufs);
break;
case HTP_OP_CUMSUM:
if (n_bufs != 2) {
FARF(ERROR, "Bad cumsum-req buffer list");
continue;
}
proc_cumsum_req(ctx, &req, bufs);
break;
default:
FARF(ERROR, "Unknown Op %u", req.op);
break;

View File

@ -67,34 +67,61 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
uint8_t * restrict pad,
const int num_elems,
float epsilon) {
(void)pad;
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
const int nvec = num_elems / VLEN_FP32; // number of full vectors
const int nloe = num_elems % VLEN_FP32; // leftover elements
// Compute sum of squares for full vectors
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
int step_of_1 = num_elems >> 5;
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
for (int i = 0; i < nvec; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); // replicated over all lanes
// Handle tail elements using vectorized ops with masking
if (nloe > 0) {
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
// Reduce HVX sum
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
// Scale full vectors
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
for (int i = 0; i < nvec; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
}
// Handle tail elements using vectorized ops with masking
if (nloe > 0) {
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
HVX_Vector result = Q6_Vsf_equals_Vqf32(v2);
// Store with masking to avoid overwriting memory beyond the tensor
hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
}
}

View File

@ -9612,6 +9612,9 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
cl_mem B_image1d;
cl_mem B_sub_buffer;
cl_mem S_image1d;
// for B transpose
cl_mem B_image1d_trans = nullptr;
cl_mem B_d = nullptr;
cl_mem D_image1d;
cl_mem D_sub_buffer;
@ -9703,9 +9706,6 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
global_work_size[2] = 1;
} else {
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_mem B_image1d_trans = nullptr;
// for B transpose
cl_mem B_d = nullptr;
int padding;
//how many extra elements beyond multiple of 8
@ -9800,6 +9800,12 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
CL_CHECK(clReleaseMemObject(S_image1d));
CL_CHECK(clReleaseMemObject(D_sub_buffer));
CL_CHECK(clReleaseMemObject(D_image1d));
if (B_image1d_trans) {
CL_CHECK(clReleaseMemObject(B_image1d_trans));
}
if (B_d) {
CL_CHECK(clReleaseMemObject(B_d));
}
#else
GGML_UNUSED(backend);
GGML_UNUSED(src0);

View File

@ -0,0 +1,45 @@
{{- bos_token -}}
{%- set keep_past_thinking = keep_past_thinking | default(false) -%}
{%- set ns = namespace(system_prompt="") -%}
{%- if messages[0]["role"] == "system" -%}
{%- set ns.system_prompt = messages[0]["content"] -%}
{%- set messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
{%- for tool in tools -%}
{%- if tool is not string -%}
{%- set tool = tool | tojson -%}
{%- endif -%}
{%- set ns.system_prompt = ns.system_prompt + tool -%}
{%- if not loop.last -%}
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
{%- endif -%}
{%- endfor -%}
{%- set ns.system_prompt = ns.system_prompt + "]" -%}
{%- endif -%}
{%- if ns.system_prompt -%}
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{%- set ns.last_assistant_index = -1 -%}
{%- for message in messages -%}
{%- if message["role"] == "assistant" -%}
{%- set ns.last_assistant_index = loop.index0 -%}
{%- endif -%}
{%- endfor -%}
{%- for message in messages -%}
{{- "<|im_start|>" + message["role"] + "\n" -}}
{%- set content = message["content"] -%}
{%- if content is not string -%}
{%- set content = content | tojson -%}
{%- endif -%}
{%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
{%- if "</think>" in content -%}
{%- set content = content.split("</think>")[-1] | trim -%}
{%- endif -%}
{%- endif -%}
{{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}

File diff suppressed because it is too large Load Diff

View File

@ -1 +1 @@
a04eea0761a85d18f3f504d6ab970c5c9dce705f
50634c28837c24ac68b380b5750b41e701c87d73

View File

@ -19,7 +19,7 @@
// dedup helpers
static ggml_tensor * build_kq_mask(
static ggml_tensor * build_attn_inp_kq_mask(
ggml_context * ctx,
const llama_kv_cache_context * mctx,
const llama_ubatch & ubatch,
@ -28,7 +28,11 @@ static ggml_tensor * build_kq_mask(
const auto n_tokens = ubatch.n_tokens;
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(res);
ggml_set_name(res, "attn_inp_kq_mask");
return res;
}
static bool can_reuse_kq_mask(
@ -52,6 +56,21 @@ static bool can_reuse_kq_mask(
// impl
static ggml_tensor * ggml_mul_mat_aux(
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * rot) {
const auto n = rot->ne[0];
ggml_tensor * res;
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
res = ggml_mul_mat (ctx, rot, res);
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
return res;
}
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
@ -429,6 +448,14 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_v_idxs(self_v_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
if (self_k_rot) {
mctx->set_input_k_rot(self_k_rot);
}
if (self_v_rot) {
mctx->set_input_v_rot(self_v_rot);
}
}
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
@ -476,6 +503,14 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_k_rot) {
mctx->get_base()->set_input_k_rot(self_k_rot);
}
if (self_v_rot) {
mctx->get_base()->set_input_v_rot(self_v_rot);
}
}
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
@ -532,6 +567,14 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
if (inp_attn->self_k_rot) {
mctx->get_attn()->set_input_k_rot(inp_attn->self_k_rot);
}
if (inp_attn->self_v_rot) {
mctx->get_attn()->set_input_v_rot(inp_attn->self_v_rot);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@ -630,6 +673,14 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (inp_attn->self_k_rot) {
attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
}
if (inp_attn->self_v_rot) {
attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@ -2002,13 +2053,13 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0);
inp->self_v_rot = mctx_cur->build_input_v_rot(ctx0);
return inp;
}
@ -2034,6 +2085,15 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const {
GGML_ASSERT(v_mla == nullptr);
if (inp->self_k_rot) {
q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot);
k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot);
}
if (inp->self_v_rot) {
v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot);
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
@ -2061,6 +2121,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_v_rot) {
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
}
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
@ -2090,9 +2154,7 @@ static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
@ -2171,6 +2233,18 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v_mla,
float kq_scale,
int il) const {
if (inp->self_k_rot) {
q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot);
if (k_cur) {
k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot);
}
}
if (inp->self_v_rot) {
if (v_cur) {
v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot);
}
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, q_cur);
@ -2211,6 +2285,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_v_rot) {
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
}
if (wo) {
cur = build_lora_mm(wo, cur);
}
@ -2293,12 +2371,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
}
{
@ -2307,14 +2381,13 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask_swa = build_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
}
inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0);
inp->self_v_rot = mctx_cur->get_base()->build_input_v_rot(ctx0);
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}
@ -2473,9 +2546,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask = build_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
ggml_set_input(inp_attn->self_kq_mask);
inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
}
@ -2483,9 +2554,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp_attn->self_kq_mask_swa = build_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
ggml_set_input(inp_attn->self_kq_mask_swa);
inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
}

View File

@ -308,6 +308,10 @@ public:
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
// note: assumes v_rot^ == I
ggml_tensor * self_k_rot = nullptr;
ggml_tensor * self_v_rot = nullptr;
// note: these have to be copies because in order to be able to reuse a graph, its inputs
// need to carry these parameters with them. otherwise, they can point to freed
// llm_graph_params from a previous batch, causing stack-use-after-return
@ -384,6 +388,10 @@ public:
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
// note: using same rotation matrices for both base and swa cache
ggml_tensor * self_k_rot = nullptr;
ggml_tensor * self_v_rot = nullptr;
const llama_hparams hparams;
const llama_cparams cparams;

View File

@ -13,6 +13,65 @@
#include <map>
#include <stdexcept>
static bool ggml_is_power_of_2(int n) {
return (n & (n - 1)) == 0;
}
// orthonormal Walsh-Hadamard rotation matrix
// note: res^2 == I
static void ggml_gen_hadamard(ggml_tensor * tensor) {
assert(tensor->type == GGML_TYPE_F32);
const int n = tensor->ne[0];
assert(ggml_is_power_of_2(n));
assert(tensor->ne[1] == n);
assert(tensor->ne[2] == 1);
assert(tensor->ne[3] == 1);
std::vector<float> data_f32;
float * data = (float *) tensor->data;
if (tensor->type != GGML_TYPE_F32) {
data_f32.resize(n*n);
data = data_f32.data();
}
data[0*n + 0] = 1.0 / sqrtf(n);
for (int s = 1; s < n; s *= 2) {
for (int i = 0; i < s; i++) {
for (int j = 0; j < s; j++) {
const float val = data[i*n + j];
data[(i + s)*n + (j )] = val;
data[(i )*n + (j + s)] = val;
data[(i + s)*n + (j + s)] = -val;
}
}
}
if (tensor->type != GGML_TYPE_F32) {
ggml_quantize_chunk(tensor->type, data, tensor->data, 0, 1, n*n, nullptr);
}
}
static ggml_tensor * ggml_mul_mat_aux(
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * rot) {
const auto n = rot->ne[0];
ggml_tensor * res;
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
res = ggml_mul_mat (ctx, rot, res);
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
return res;
}
//
// llama_kv_cache
//
@ -209,6 +268,48 @@ llama_kv_cache::llama_kv_cache(
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE");
const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false;
if (attn_rot_disable) {
LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__);
}
attn_rot_k =
!attn_rot_disable &&
ggml_is_quantized(type_k) &&
!hparams.is_n_embd_k_gqa_variable() &&
hparams.n_embd_head_k() % 64 == 0;
attn_rot_v =
!attn_rot_disable &&
ggml_is_quantized(type_v) &&
!hparams.is_n_embd_v_gqa_variable() &&
hparams.n_embd_head_v() % 64 == 0;
LLAMA_LOG_INFO("%s: attn_rot_k = %d\n", __func__, attn_rot_k);
LLAMA_LOG_INFO("%s: attn_rot_v = %d\n", __func__, attn_rot_v);
// pre-compute the haramard matrices and keep them in host memory
// TODO: in the future, we can make copies in the backend buffers to avoid host -> device transfers
if (attn_rot_k || attn_rot_v) {
for (int64_t n = 64; n <= std::max(hparams.n_embd_head_k(), hparams.n_embd_head_v()); n *= 2) {
attn_rot_hadamard[n] = std::vector<float>(n*n);
ggml_init_params params = {
/* .mem_size = */ 1*ggml_tensor_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context_ptr ctx { ggml_init(params) };
ggml_tensor * tmp = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, n, n);
tmp->data = attn_rot_hadamard[n].data();
ggml_gen_hadamard(tmp);
}
}
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
}
@ -1004,6 +1105,14 @@ bool llama_kv_cache::get_has_shift() const {
return result;
}
ggml_type llama_kv_cache::type_k() const {
return layers[0].k->type;
}
ggml_type llama_kv_cache::type_v() const {
return layers[0].v->type;
}
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;
@ -1189,6 +1298,47 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
return v_idxs;
}
ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const {
ggml_tensor * res = nullptr;
if (attn_rot_k) {
int nrot = 64;
// TODO: investigate if using the smallest rotation matrix is beneficial also for K (similar as for V)
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
do {
nrot *= 2;
} while (hparams.n_embd_head_k() % nrot == 0);
nrot /= 2;
res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(res);
ggml_set_name(res, "attn_inp_k_rot");
}
return res;
}
ggml_tensor * llama_kv_cache::build_input_v_rot(ggml_context * ctx) const {
ggml_tensor * res = nullptr;
if (attn_rot_v) {
int nrot = 64;
// using smaller rotation matrices for V seems beneficial
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4146397570
//do {
// nrot *= 2;
//} while (hparams.n_embd_head_v() % nrot == 0);
//nrot /= 2;
res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(res);
ggml_set_name(res, "attn_inp_v_rot");
}
return res;
}
void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
const uint32_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
@ -1507,6 +1657,24 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
}
}
void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const {
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
const auto n_rot = dst->ne[0];
GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
}
void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const {
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
const auto n_rot = dst->ne[0];
GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
}
size_t llama_kv_cache::total_size() const {
size_t size = 0;
@ -1542,6 +1710,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rot,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@ -1567,10 +1736,16 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
// rotate back
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
tmp = ggml_rope_ext(ctx, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
// rotate fwd
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
tmp = ggml_cpy(ctx, tmp, cur);
} else {
// we rotate only the first n_rot dimensions
@ -1591,6 +1766,9 @@ public:
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
// note: assumes k_rot^2 == I
ggml_tensor * k_rot = nullptr;
const llama_kv_cache * kv_self;
};
@ -1600,6 +1778,10 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
if (k_shift) {
kv_self->set_input_k_shift(k_shift);
}
if (k_rot) {
kv_self->set_input_k_rot(k_rot);
}
}
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
@ -1611,6 +1793,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_set_input(inp->k_shift);
inp->k_rot = build_input_k_rot(ctx);
const auto & cparams = lctx->get_cparams();
for (const auto & layer : layers) {
@ -1635,7 +1819,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->k_rot, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
@ -2239,6 +2423,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
return n_kv;
}
ggml_type llama_kv_cache_context::type_k() const {
return kv->type_k();
}
ggml_type llama_kv_cache_context::type_v() const {
return kv->type_v();
}
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}
@ -2263,6 +2455,14 @@ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, con
return kv->build_input_v_idxs(ctx, ubatch);
}
ggml_tensor * llama_kv_cache_context::build_input_k_rot(ggml_context * ctx) const {
return kv->build_input_k_rot(ctx);
}
ggml_tensor * llama_kv_cache_context::build_input_v_rot(ggml_context * ctx) const {
return kv->build_input_v_rot(ctx);
}
void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
kv->set_input_k_shift(dst);
}
@ -2282,3 +2482,11 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
kv->set_input_pos_bucket(dst, ubatch);
}
void llama_kv_cache_context::set_input_k_rot(ggml_tensor * dst) const {
kv->set_input_k_rot(dst);
}
void llama_kv_cache_context::set_input_v_rot(ggml_tensor * dst) const {
kv->set_input_v_rot(dst);
}

View File

@ -152,6 +152,9 @@ public:
bool get_has_shift() const;
ggml_type type_k() const;
ggml_type type_v() const;
//
// graph_build API
//
@ -191,6 +194,9 @@ public:
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
@ -199,6 +205,9 @@ public:
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
void set_input_k_rot(ggml_tensor * dst) const;
void set_input_v_rot(ggml_tensor * dst) const;
private:
const llama_model & model;
const llama_hparams & hparams;
@ -226,6 +235,13 @@ private:
// SWA
const uint32_t n_swa = 0;
// env: LLAMA_ATTN_ROT_DISABLE
bool attn_rot_k = false;
bool attn_rot_v = false;
// pre-computed hadamard martrices
std::unordered_map<int64_t, std::vector<float>> attn_rot_hadamard;
// env: LLAMA_KV_CACHE_DEBUG
int debug = 0;
@ -262,6 +278,7 @@ private:
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rot,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@ -328,6 +345,9 @@ public:
uint32_t get_n_kv() const;
ggml_type type_k() const;
ggml_type type_v() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
@ -347,6 +367,9 @@ public:
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
@ -354,6 +377,9 @@ public:
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
void set_input_k_rot(ggml_tensor * dst) const;
void set_input_v_rot(ggml_tensor * dst) const;
private:
llama_memory_status status;

View File

@ -12,6 +12,7 @@
#include <cstring>
#include <future>
#include <regex>
#include <unordered_map>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
@ -550,6 +551,7 @@ llama_model_loader::llama_model_loader(
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
file_paths.emplace_back(fname);
contexts.emplace_back(ctx);
if (use_mmap && use_direct_io) {
@ -632,6 +634,7 @@ llama_model_loader::llama_model_loader(
}
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
file_paths.emplace_back(fname_split);
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
@ -1410,6 +1413,21 @@ bool llama_model_loader::load_all_data(
std::vector<no_init<uint8_t>> read_buf;
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
// Thread-local file handles to avoid seek/read races when loading multiple
// contexts in parallel.
thread_local std::unordered_map<std::string, std::unique_ptr<llama_file>> local_files;
auto get_local_file = [this](size_t idx) -> llama_file * {
const std::string key = file_paths.at(idx) + (use_direct_io ? "#dio1" : "#dio0");
auto it = local_files.find(key);
if (it == local_files.end()) {
auto local_file = std::make_unique<llama_file>(file_paths.at(idx).c_str(), "rb", use_direct_io);
auto * ptr = local_file.get();
local_files.emplace(key, std::move(local_file));
return ptr;
}
return it->second.get();
};
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
@ -1516,7 +1534,7 @@ bool llama_model_loader::load_all_data(
}
if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
if (!progress_callback((float) size_done.load(std::memory_order_relaxed) / size_data, progress_callback_user_data)) {
return false;
}
}
@ -1540,19 +1558,21 @@ bool llama_model_loader::load_all_data(
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
if (buf_mmap && cur->data == nullptr) {
ggml_backend_tensor_alloc(buf_mmap, cur, data);
if (lmlocks) {
const auto & lmlock = lmlocks->at(weight->idx);
lmlock->grow_to(weight->offs + n_size);
{
std::lock_guard<std::mutex> lock(mmaps_used_mutex);
if (lmlocks) {
const auto & lmlock = lmlocks->at(weight->idx);
lmlock->grow_to(weight->offs + n_size);
}
auto & mmap_used = mmaps_used[weight->idx];
mmap_used.first = std::min(mmap_used.first, weight->offs);
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
}
auto & mmap_used = mmaps_used[weight->idx];
mmap_used.first = std::min(mmap_used.first, weight->offs);
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
} else {
ggml_backend_tensor_set(cur, data, 0, n_size);
}
} else {
const auto & file = files.at(weight->idx);
llama_file * file = get_local_file(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
@ -1628,7 +1648,7 @@ bool llama_model_loader::load_all_data(
}
}
size_done += n_size;
size_done.fetch_add(n_size, std::memory_order_relaxed);
}
// free temporary resources used for async uploads
@ -1655,7 +1675,12 @@ bool llama_model_loader::load_all_data(
}
// check if this is the last call and do final cleanup
if (size_done >= size_data) {
if (size_done.load(std::memory_order_relaxed) >= size_data) {
bool expected = false;
if (!final_cleanup_done.compare_exchange_strong(expected, true, std::memory_order_acq_rel)) {
return true;
}
// unmap offloaded tensors and metadata
if (use_mmap) {
for (uint32_t idx = 0; idx < mappings.size(); idx++) {

View File

@ -9,9 +9,11 @@
#include "ggml-cpp.h"
#include <atomic>
#include <cstddef>
#include <cstring>
#include <map>
#include <mutex>
#include <stdexcept>
#include <unordered_map>
@ -81,6 +83,7 @@ struct llama_model_loader {
bool no_alloc;
llama_files files;
std::vector<std::string> file_paths;
llama_ftype ftype;
llama_fver fver;
@ -99,9 +102,11 @@ struct llama_model_loader {
std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
size_t size_done = 0;
std::atomic<size_t> size_done = 0;
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
std::mutex mmaps_used_mutex;
std::atomic<bool> final_cleanup_done = false;
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {

View File

@ -17,12 +17,15 @@
#include "models/models.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cfloat>
#include <cstdint>
#include <cstring>
#include <cmath>
#include <functional>
#include <future>
#include <map>
#include <regex>
#include <sstream>
@ -7772,10 +7775,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// load tensor data
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD");
const size_t default_limit = 0;
const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit;
const size_t n_contexts = ctx_buf_maps.size();
const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;
const bool use_parallel = n_contexts > 1 && parallel_limit > 1;
if (use_parallel) {
LLAMA_LOG_INFO("%s: using parallel loading for %zu GPU contexts (limit=%zu)\n", __func__, n_contexts, parallel_limit);
std::atomic<bool> load_failed{false};
std::vector<std::future<bool>> futures;
futures.reserve(n_contexts);
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (futures.size() >= parallel_limit) {
if (!futures.front().get()) {
load_failed.store(true, std::memory_order_relaxed);
}
futures.erase(futures.begin());
}
auto * ctx_ptr = ctx;
auto * buf_map_ptr = &buf_map;
auto * mlock_ptr = use_mlock ? &pimpl->mlock_mmaps : nullptr;
futures.emplace_back(std::async(std::launch::async, [&ml, ctx_ptr, buf_map_ptr, mlock_ptr, &load_failed]() {
if (load_failed.load(std::memory_order_relaxed)) {
return false;
}
return ml.load_all_data(
ctx_ptr,
*buf_map_ptr,
mlock_ptr,
nullptr,
nullptr);
}));
}
for (auto & future : futures) {
if (!future.get()) {
load_failed.store(true, std::memory_order_relaxed);
}
}
if (load_failed.load(std::memory_order_relaxed)) {
return false;
}
} else {
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
return false;
}
}
}
if (use_mmap_buffer) {

View File

@ -2712,6 +2712,67 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens
{
auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);
// Basic content only
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("[special_function(arg1=1)]")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("[get_time(city=\"XYZCITY\")]")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think>[special_function(arg1=1)]")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Multiple tool calls (parallel)
tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
.run();
// Tool call with content before tool call
tst.test("Let me check the time.[get_time(city=\"Paris\")]")
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
))
.run();
// Partial tool call (streaming)
tst.test("[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("[empty_args()]")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();
}
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
{

View File

@ -143,7 +143,11 @@ bool server_http_context::init(const common_params & params) {
"/v1/health",
"/models",
"/v1/models",
"/api/tags"
"/api/tags",
"/",
"/index.html",
"/bundle.js",
"/bundle.css",
};
// If API key is not set, skip validation
@ -151,8 +155,8 @@ bool server_http_context::init(const common_params & params) {
return true;
}
// If path is public or is static file, skip validation
if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
// If path is public or static file, skip validation
if (public_endpoints.find(req.path) != public_endpoints.end()) {
return true;
}

View File

@ -22,6 +22,15 @@ def test_access_public_endpoint(endpoint: str):
assert "error" not in res.body
def test_access_static_assets_without_api_key():
"""Static web UI assets should not require API key authentication (issue #21229)"""
global server
server.start()
for path in ["/", "/bundle.js", "/bundle.css"]:
res = server.make_request("GET", path)
assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}"
@pytest.mark.parametrize("api_key", [None, "invalid-key"])
def test_incorrect_api_key(api_key: str):
global server