Compare commits

...

12 Commits

Author SHA1 Message Date
Dan Hoffman f9d6cd646d
Merge 9f7ce433aa into 43a4ee4a2c 2026-04-03 13:16:36 +02:00
uvos 43a4ee4a2c
HIP: build each CI build test for a different architecture (#21337)
This helps improve our chances of finding build failures before the release workflow
builds for all architectures.
2026-04-03 11:38:22 +02:00
Tillerino f851fa5ab0
fix: add openssl to nix dependencies (#21353) (#21355) 2026-04-03 12:21:07 +03:00
Vishal Singh f1ac84119c
ggml-zendnn : add MUL_MAT_ID op support for MoE models (#21315)
* ggml-zendnn : add MUL_MAT_ID op support for MoE models
- Add MUL_MAT_ID op acceleration for Mixture-of-Experts models
- MUL_MAT_ID op fallback to CPU backend if total experts > 32
- Point ZenDNN lib to the latest ZenDNN-2026-WW13 bits

* ggml-zendnn : add braces to sgemm failure condition for consistency

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-04-03 12:19:08 +03:00
Piotr Wilkin (ilintar) b069b10ab4
vocab: fix Gemma4 tokenizer (#21343)
* seems to work

* fix case with new line

Co-authored-by: sayap <sokann@gmail.com>

* gemma 4: fix pre tok regex

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: sayap <sokann@gmail.com>
2026-04-03 10:33:03 +02:00
Radoslav Gerganov 0c58ba3365
rpc : reuse compute graph buffers (#21299)
Reuse the buffer for the ggml context that is used for creating the
compute graph on the server side. This partially addresses a memory leak
caused by the CUDA backend using buffer addresses as cache
keys.

ref: #21265
ref: #20315
2026-04-03 10:28:09 +03:00
Georgi Gerganov 57ace0d612
chat : avoid including json in chat.h (#21306) 2026-04-03 09:07:59 +03:00
Georgi Gerganov 39b27f0da0
(revert) kv-cache : do not quantize SWA KV cache (#21332)
This reverts commit 17193cce34.
2026-04-03 09:07:01 +03:00
Vishal Singh f49e917876
ci : add AMD ZenDNN label to PR labeler (#21345)
* ci : add AMD CPU label to PR labeler
Add automatic labeling for PRs that modify AMD CPU (ZenDNN) backend files

* ci : rename label AMD CPU to AMD ZenDNN in labeler config

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-04-03 10:35:15 +08:00
Slobodan Josic 7c7d6ce5c7
[HIP] Bump ROCm version to 7.2.1 (#21066)
Bump ROCm version on Linux from 7.2 to 7.2.1
Add gfx1102 target
Delete the LLVM workaround, since ROCm 7.2.1 includes a fix for the ROCm 7.2 perf regression: https://github.com/ROCm/rocm-systems/issues/2865

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-03 00:59:20 +02:00
Piotr Wilkin (ilintar) 5208e2d5ba
fix: gemma 4 template (#21326) 2026-04-02 23:31:02 +02:00
Dan Hoffman 9f7ce433aa Fix undefined timing measurement errors in server context 2026-03-30 21:09:31 -07:00
30 changed files with 3776 additions and 7339 deletions

View File

@ -16,7 +16,7 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
curl,
openssl,
shaderc,
useBlas ?
builtins.all (x: !x) [
@ -160,7 +160,8 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ [ openssl ];
cmakeFlags =
[

View File

@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=7.2
ARG AMDGPU_VERSION=7.2
ARG ROCM_VERSION=7.2.1
ARG AMDGPU_VERSION=7.2.1
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@ -12,11 +12,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# This is mostly tied to rocBLAS supported archs.
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}

.github/labeler.yml
View File

@ -27,6 +27,11 @@ IBM zDNN:
- any-glob-to-any-file:
- ggml/include/ggml-zdnn.h
- ggml/src/ggml-zdnn/**
AMD ZenDNN:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-zendnn.h
- ggml/src/ggml-zendnn/**
documentation:
- changed-files:
- any-glob-to-any-file:

View File

@ -472,6 +472,7 @@ jobs:
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS="gfx1030" \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
@ -941,7 +942,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -984,12 +985,13 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGPU_TARGETS="gfx1100" `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

View File

@ -35,7 +35,7 @@ env:
jobs:
ubuntu-22-hip-quality-check:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:7.2
container: rocm/dev-ubuntu-22.04:7.2.1
steps:
- name: Clone
id: checkout
@ -59,7 +59,7 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGPU_TARGETS=gfx908 \
-DGPU_TARGETS=gfx942 \
-DGGML_HIP=ON \
-DGGML_HIP_EXPORT_METRICS=Off \
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \

View File

@ -639,8 +639,8 @@ jobs:
strategy:
matrix:
include:
- ROCM_VERSION: "7.2"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
- ROCM_VERSION: "7.2.1"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
steps:
@ -662,7 +662,7 @@ jobs:
sudo apt install -y build-essential git cmake wget
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2'
if: matrix.ROCM_VERSION == '7.2.1'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
@ -683,7 +683,7 @@ jobs:
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Setup TheRock
if: matrix.ROCM_VERSION != '7.2'
if: matrix.ROCM_VERSION != '7.2.1'
id: therock_env
run: |
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
@ -699,7 +699,6 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
@ -717,17 +716,20 @@ jobs:
id: tag
uses: ./.github/actions/get-tag-name
- name: Get ROCm short version
run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
windows-hip:
runs-on: windows-2022
@ -749,7 +751,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -806,7 +808,7 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `

View File

@ -7,11 +7,109 @@
#include "log.h"
#include "nlohmann/json.hpp"
#include <algorithm>
#include <stdexcept>
#include <string>
using json = nlohmann::ordered_json;
namespace {
// Gemma4-specific PEG builder extending the standard chat builder.
// Adds value type parsers that use <|\"|> as string delimiters
// instead of JSON's double quotes, and disables json-to-schema
// conversion for these types.
class common_peg_gemma4_builder {
common_chat_peg_builder & p_;
static constexpr const char * QUOTE = "<|\"|>";
public:
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
common_peg_parser gemma4_string() {
return p_.rule("gemma4-string", [&]() {
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
});
}
common_peg_parser gemma4_number() {
return p_.rule("gemma4-number", [&]() {
auto digit1_9 = p_.chars("[1-9]", 1, 1);
auto digits = p_.chars("[0-9]");
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
auto frac = p_.sequence({p_.literal("."), digits});
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
p_.optional(p_.chars("[+-]", 1, 1)), digits});
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
p_.optional(exp), not_number_continuation});
});
}
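// accepts e.g. -12.5e3 and 0; rejects 01 and a bare trailing dot, matching
// JSON number syntax, and the final negative lookahead prevents the rule
// from succeeding mid-number (e.g. stopping after "12" in "12.5")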
common_peg_parser gemma4_bool() {
return p_.rule("gemma4-bool", [&]() {
return p_.choice({p_.literal("true"), p_.literal("false")});
});
}
common_peg_parser gemma4_null() {
return p_.rule("gemma4-null", [&]() {
return p_.literal("null");
});
}
common_peg_parser gemma4_dict() {
return p_.rule("gemma4-dict", [&]() {
auto ws = p_.space();
auto key = p_.until(":");
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
return p_.sequence({
p_.literal("{"), ws,
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
});
});
}
common_peg_parser gemma4_array() {
return p_.rule("gemma4-array", [&]() {
auto ws = p_.space();
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
return p_.sequence({
p_.literal("["), ws,
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
});
});
}
common_peg_parser gemma4_value() {
return p_.rule("gemma4-value", [&]() {
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
gemma4_number(), gemma4_bool(), gemma4_null()});
});
}
// Select the appropriate value parser based on JSON schema type.
// Does NOT use schema() - the gemma4 types are pure PEG without
// JSON schema metadata, so GBNF is generated directly from the
// PEG structure.
common_peg_parser gemma4_value_for_type(const json & schema) {
if (!schema.contains("type") || !schema.at("type").is_string()) {
return gemma4_value();
}
std::string type = schema.at("type").get<std::string>();
if (type == "string") { return gemma4_string(); }
if (type == "number") { return gemma4_number(); }
if (type == "integer") { return gemma4_number(); }
if (type == "boolean") { return gemma4_bool(); }
if (type == "object") { return gemma4_dict(); }
if (type == "array") { return gemma4_array(); }
return gemma4_value();
}
};
} // anonymous namespace
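// Illustrative (not from the patch): the surface form these parsers accept
// uses <|"|> as the string delimiter and unquoted dict keys, e.g.
//   {city:<|"|>London<|"|>,tags:[<|"|>a<|"|>,<|"|>b<|"|>],count:3}
// gemma4_value() matches any of these forms; gemma4_dict()/gemma4_array()
// handle the nested container cases.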
// Helper to iterate over tools/functions
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
@ -43,7 +141,9 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
? COMMON_CHAT_FORMAT_PEG_GEMMA4
: COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
auto parser = autoparser.build_parser(inputs);
@ -92,6 +192,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
ctx.reasoning = &reasoning;
// Build reasoning parser
ctx.reasoning_parser = reasoning.build_parser(ctx);
@ -440,7 +541,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
// The Gemma4 string quote token used in place of JSON "
common_peg_gemma4_builder g4(p);
static const std::string QUOTE = "<|\"|>";
common_peg_parser tool_choice = p.choice();
@ -451,7 +552,6 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
// No arguments - just match the function name with empty braces
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(p.eps()) +
@ -486,9 +586,18 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
p.tool_arg_string_value(p.schema(p.until(QUOTE),
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
p.literal(QUOTE);
} else if (type == "number" || type == "integer") {
value_parser = p.tool_arg_value(g4.gemma4_number());
} else if (type == "boolean") {
value_parser = p.tool_arg_value(g4.gemma4_bool());
} else if (type == "null") {
value_parser = p.tool_arg_value(g4.gemma4_null());
} else if (type == "object") {
value_parser = p.tool_arg_value(g4.gemma4_dict());
} else if (type == "array") {
value_parser = p.tool_arg_value(g4.gemma4_array());
} else {
// Numbers, booleans: raw text up to the next comma or closing brace
value_parser = p.tool_arg_value(p.until_one_of({",", "}"}));
value_parser = p.tool_arg_value(g4.gemma4_value());
}
auto arg = p.tool_arg(
@ -538,9 +647,9 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
tool_calls = p.optional(tool_calls);
}
auto content_before_tools = p.until(format.per_call_start);
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
return ctx.reasoning_parser +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools))) +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
tool_calls + p.end();
}

View File

@ -1,7 +1,7 @@
#pragma once
#include "chat-auto-parser.h"
#include "peg-parser.h"
#include <functional>
#include <optional>
#include <string>

View File

@ -4,6 +4,7 @@
#include "common.h"
#include "jinja/caps.h"
#include "peg-parser.h"
#include "nlohmann/json.hpp"
#include <chrono>
#include <optional>
@ -215,12 +216,14 @@ struct tool_id_analysis {
// ============================================================================
struct analyze_content;
struct analyze_reasoning;
struct parser_build_context {
common_chat_peg_builder & p;
const generation_params & inputs;
const generation_params & inputs;
common_peg_parser reasoning_parser;
bool extracting_reasoning = false;
const analyze_reasoning * reasoning = nullptr;
const analyze_content * content = nullptr;
parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);

View File

@ -104,10 +104,11 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
analysis.tools.function.name_suffix = "";
analysis.tools.arguments.start = "{";
analysis.tools.arguments.end = "}";
analysis.tools.arguments.name_prefix = "";
analysis.tools.arguments.name_suffix = ":";
analysis.tools.arguments.separator = ",";
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<|channel>thought\n";
analysis.reasoning.start = "<|channel>thought";
analysis.reasoning.end = "<channel|>";
analysis.preserved_tokens.clear();
analysis.preserved_tokens.push_back("<|tool_call>");

View File

@ -75,6 +75,84 @@ static std::string escape_json_string_inner(const std::string & s) {
return escaped;
}
static const std::string GEMMA4_QUOTE = "<|\"|>";
static std::string normalize_gemma4_to_json(const std::string & input) {
std::string result;
result.reserve(input.size() * 2);
enum Ctx { DICT, ARRAY };
std::vector<Ctx> ctx;
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
auto skip_ws = [&](size_t & pos) {
while (pos < input.size() && is_ws(input[pos])) {
result += input[pos++];
}
};
auto quote_unquoted_key = [&](size_t & pos) {
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
result += '"';
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
result += input[pos++];
}
result += '"';
skip_ws(pos);
}
};
size_t i = 0;
while (i < input.size()) {
if (i + GEMMA4_QUOTE.size() <= input.size() &&
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
result += '"';
i += GEMMA4_QUOTE.size();
continue;
}
char c = input[i];
if (c == '{') {
result += c;
ctx.push_back(DICT);
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
if (c == '}') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == '[') {
result += c;
ctx.push_back(ARRAY);
++i;
continue;
}
if (c == ']') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
result += c;
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
result += c;
++i;
}
return result;
}
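// hedged usage sketch (direct call is hypothetical; in this patch the
// function is only reached via common_chat_peg_gemma4_mapper below):
//   normalize_gemma4_to_json("{city:<|\"|>London<|\"|>,count:3}")
//     == "{\"city\":\"London\",\"count\":3}"
// unquoted dict keys gain quotes and each <|"|> becomes a JSON '"'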
// Convert Python-style single-quoted strings to JSON double-quoted strings
// Only converts outer string delimiters, properly handling escape sequences:
// - {'key': 'value'} -> {"key": "value"}
@ -214,6 +292,14 @@ std::string & common_chat_peg_mapper::args_target() {
return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
}
std::string common_chat_peg_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(input);
}
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
}
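// Gemma4 containers are first rewritten to JSON, then passed through the
// shared single-quote normalization used by the base mapper.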
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
const common_peg_parse_result & parse_result_arg) {
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
@ -352,7 +438,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
// For potential containers, normalize Python-style single quotes to JSON double quotes
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
if (is_potential_container) {
value_content = normalize_quotes_to_json(value_content);
value_content = normalize_container_value(value_content);
}
// Try to parse as JSON value (number, bool, null, object, array)

View File

@ -17,7 +17,9 @@ class common_chat_peg_mapper {
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
virtual void map(const common_peg_ast_node & node);
private:
protected:
virtual std::string normalize_container_value(const std::string & input);
private:
// Tool call handling state
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
common_chat_tool_call * current_tool = nullptr;
@ -30,6 +32,13 @@ class common_chat_peg_mapper {
std::string & args_target();
};
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
public:
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
protected:
std::string normalize_container_value(const std::string & input) override;
};
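// instantiated by common_chat_peg_parse() when params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4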
struct content_structure;
struct tool_call_structure;

View File

@ -13,6 +13,8 @@
#include "jinja/caps.h"
#include "peg-parser.h"
#include "nlohmann/json.hpp"
#include <cstdio>
#include <cstdlib>
#include <ctime>
@ -694,6 +696,8 @@ const char * common_chat_format_name(common_chat_format format) {
return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE:
return "peg-native";
case COMMON_CHAT_FORMAT_PEG_GEMMA4:
return "peg-gemma4";
default:
throw std::runtime_error("Unknown chat format");
}
@ -760,12 +764,12 @@ static void foreach_parameter(const json &
}
}
std::string common_chat_template_direct_apply(
static std::string common_chat_template_direct_apply_impl(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const std::optional<json> & messages_override,
const std::optional<json> & tools_override,
const std::optional<json> & additional_context) {
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt) {
jinja::context ctx(tmpl.source());
nlohmann::ordered_json inp = nlohmann::ordered_json{
@ -812,6 +816,12 @@ std::string common_chat_template_direct_apply(
return result;
}
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
}
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
@ -862,7 +872,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
data.supports_thinking = true;
data.thinking_start_tag = "[THINK]";
data.thinking_end_tag = "[/THINK]";
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
"[THINK]",
@ -945,7 +955,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
adjusted_messages.push_back(msg);
}
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
auto prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
// Check if we need to replace the return token with end token during
// inference and without generation prompt. For more details see:
@ -1072,7 +1082,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
">>>all",
@ -1166,7 +1176,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1289,7 +1299,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1368,7 +1378,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
@ -1439,7 +1449,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = false;
data.preserved_tokens = {
@ -1722,9 +1732,9 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
}
params.add_generation_prompt = false;
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
params.add_generation_prompt = true;
std::string gen_prompt = common_chat_template_direct_apply(tmpl, params);
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
params.generation_prompt = diff.right;
@ -1758,7 +1768,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
common_chat_params data;
auto params_copy = params;
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
data.prompt = common_chat_template_direct_apply(tmpl, params_copy);
data.prompt = common_chat_template_direct_apply_impl(tmpl, params_copy);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.generation_prompt = params.generation_prompt;
auto parser = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
@ -1905,8 +1915,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
// Try to extract any partial results from what was successfully parsed
common_chat_msg msg;
msg.role = "assistant";
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
@ -1921,8 +1936,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
common_chat_msg msg;
msg.role = "assistant";
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());

View File

@ -3,12 +3,12 @@
#pragma once
#include "common.h"
#include "jinja/parser.h"
#include "nlohmann/json_fwd.hpp"
#include "peg-parser.h"
#include "jinja/parser.h"
#include "jinja/runtime.h"
#include "jinja/caps.h"
#include "nlohmann/json.hpp"
#include "nlohmann/json_fwd.hpp"
#include <chrono>
#include <functional>
@ -19,8 +19,6 @@
using chat_template_caps = jinja::caps;
using json = nlohmann::ordered_json;
#include <nlohmann/json_fwd.hpp>
struct common_chat_templates;
namespace autoparser {
@ -75,41 +73,9 @@ struct common_chat_template {
const std::string & bos_token() const { return bos_tok; }
const std::string & eos_token() const { return eos_tok; }
// TODO: this is ugly, refactor it somehow
json add_system(const json & messages, const std::string & system_prompt) const {
GGML_ASSERT(messages.is_array());
auto msgs_copy = messages;
if (!caps.supports_system_role) {
if (msgs_copy.empty()) {
msgs_copy.insert(msgs_copy.begin(), json{
{"role", "user"},
{"content", system_prompt}
});
} else {
auto & first_msg = msgs_copy[0];
if (!first_msg.contains("content")) {
first_msg["content"] = "";
}
first_msg["content"] = system_prompt + "\n\n"
+ first_msg["content"].get<std::string>();
}
} else {
if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
msgs_copy.insert(msgs_copy.begin(), json{
{"role", "system"},
{"content", system_prompt}
});
} else if (msgs_copy[0].at("role") == "system") {
msgs_copy[0]["content"] = system_prompt;
}
}
return msgs_copy;
}
chat_template_caps original_caps() const {
return caps;
}
};
struct common_chat_msg {
@ -184,6 +150,7 @@ enum common_chat_format {
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,
COMMON_CHAT_FORMAT_PEG_NATIVE,
COMMON_CHAT_FORMAT_PEG_GEMMA4,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@ -256,8 +223,8 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
const std::string & bos_token_override = "",
const std::string & eos_token_override = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
struct common_chat_params common_chat_templates_apply(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs);
@ -274,9 +241,9 @@ std::string common_chat_format_example(const struct common_chat_templates *
bool use_jinja,
const std::map<std::string, std::string> & chat_template_kwargs);
const char * common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
const char * common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
// used by arg and server
const char * common_reasoning_format_name(common_reasoning_format format);
@ -302,7 +269,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt);
const autoparser::generation_params & inputs);

View File

@ -7464,9 +7464,6 @@ class Gemma4Model(Gemma3Model):
assert len(tokens) == vocab.vocab_size
# TODO @ngxson : there are some known (rare) issues with the tokenizer during development
# but I don't have time to dive into them right now;
# using a dedicated tokenizer name so that we can fix later without re-converting GGUF
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)

View File

@ -57,13 +57,14 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
## Supported Operations
The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-based matrix multiplication (MUL_MAT_ID)** operations. Other operations are handled by the standard CPU backend.
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT_ID | Support | Accelerated via ZenDNN LowOHA MatMul (MoE) |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
*Note:* Since MUL_MAT and MUL_MAT_ID are accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs and Mixture-of-Experts models).
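For orientation, a minimal sketch of how this op appears at the ggml graph level (names and shapes illustrative, not part of this patch; check `ggml.h` for the authoritative signature):

    // as: stacked expert weights, b: per-token inputs, ids: I32 expert indices
    struct ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);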
## DataType Supports
@ -181,7 +182,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
## Known Issues
- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
- **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
@ -216,4 +217,4 @@ Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-t
## TODO
- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
- Expand operation support beyond MUL_MAT and MUL_MAT_ID (attention operations, activations, etc.)

View File

@ -68,7 +68,7 @@ Legend:
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | | ❌ |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |

File diff suppressed because it is too large

View File

@ -1009,8 +1009,8 @@ public:
bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
struct stored_graph {
ggml_context_ptr ctx_ptr;
ggml_cgraph * graph;
std::vector<uint8_t> buffer;
ggml_cgraph * graph;
};
private:
@ -1518,10 +1518,12 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
if (stored_graphs[device].buffer.size() < buf_size) {
stored_graphs[device].buffer.resize(buf_size);
}
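// reusing the same allocation keeps tensor addresses stable across calls,
// which limits growth of the CUDA backend's address-keyed caches (#21265)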
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ NULL,
/*.mem_buffer =*/ stored_graphs[device].buffer.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };
@ -1551,7 +1553,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
}
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
stored_graphs[device].ctx_ptr.swap(ctx_ptr);
stored_graphs[device].graph = graph;
return true;
}

View File

@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
ExternalProject_Add(
zendnn
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08
GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e # ZenDNN-2026-WW13
PREFIX ${ZENDNN_PREFIX}
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
BINARY_DIR ${ZENDNN_BUILD_DIR}

View File

@ -190,6 +190,170 @@ static void ggml_zendnn_compute_forward_mul_mat(
}
}
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static void ggml_zendnn_compute_forward_mul_mat_id(
ggml_backend_zendnn_context * ctx,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // expert weights
const ggml_tensor * src1 = dst->src[1]; // inputs
const ggml_tensor * ids = dst->src[2]; // expert ids
GGML_TENSOR_BINARY_OP_LOCALS
// exit for no tokens to process
if (ne2 == 0 || ne11 == 0) {
return;
}
ggml_type const vec_dot_type = src0->type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne03 == 1);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(ne3 == 1);
// row groups
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_experts
std::vector<int64_t> matrix_row_counts(n_as, 0);
std::vector<std::vector<mmid_row_mapping>> matrix_rows(n_as);
int64_t max_rows = 0;
// group rows by expert (preprocessing step)
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int id = 0; id < n_ids; ++id) {
const int32_t i02 = *(const int32_t *)((const char *)ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
matrix_rows[i02].push_back({id, iid1});
matrix_row_counts[i02]++;
if (matrix_row_counts[i02] > max_rows) {
max_rows = matrix_row_counts[i02];
}
}
}
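// worked example (hypothetical ids): with 2 tokens and n_ids = 2,
// ids = [[0, 3], [3, 1]] gives matrix_rows[0] = {(0,0)}, matrix_rows[1] = {(1,1)},
// matrix_rows[3] = {(1,0), (0,1)} (i1 = slot within token, i2 = token index),
// and max_rows = 2 sizes the per-expert gather buffer below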
if (max_rows == 0) {
return; // no rows to process
}
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
// size for converting src1 rows to vec_dot_type if needed
const size_t nbw1 = row_size;
const size_t nbw2 = nbw1 * ne11;
const size_t nbw3 = nbw2 * ne12;
const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
// size for MoE gather/scatter buffers
const size_t wdata_cur_size = max_rows * row_size;
const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
// allocate single buffer for all needs
const size_t total_size = src1_conv_size + wdata_cur_size + dst_cur_size;
if (ctx->work_size < total_size) {
ctx->work_data.reset(new char[total_size]);
ctx->work_size = total_size;
}
// partition the buffer
char * work_data = ctx->work_data.get();
char * wdata_cur = work_data + src1_conv_size;
char * dst_cur = wdata_cur + wdata_cur_size;
if (src1->type != vec_dot_type) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
#pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
const float * src1_f32 = (float *)((char *)src1->data + i11*nb11 + i12*nb12 + i13*nb13);
void * src1_conv = (char *)work_data + i11*nbw1 + i12*nbw2 + i13*nbw3;
from_float(src1_f32, src1_conv, ne10);
}
}
}
}
const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
// process each expert with gather -> gemm -> scatter pattern
for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
// gather input rows for this expert
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2;
std::memcpy(
wdata_cur + ir1 * row_size,
(const char *) wdata + (i11 + i12*ne11) * row_size,
row_size
);
}
// batched gemm for all tokens in this expert
if (!ggml_zendnn_sgemm(ctx,
ne01, // m
cne1, // n
ne10, // k
src0_cur,
ne00, // lda
wdata_cur,
ne10, // ldb
dst_cur,
ne01, // ldc
src0->type,
vec_dot_type,
dst->type)) {
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
}
// scatter output rows to destination
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i1 = id;
const int64_t i2 = row_mapping.i2;
std::memcpy(
(char *) dst->data + i1*nb1 + i2*nb2,
dst_cur + ir1 * ggml_row_size(dst->type, ne01),
ggml_row_size(dst->type, ne01)
);
}
}
}
// backend interface
static const char * ggml_backend_zendnn_get_name(ggml_backend_t backend) {
@ -218,6 +382,9 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
case GGML_OP_MUL_MAT:
ggml_zendnn_compute_forward_mul_mat(ctx, node);
break;
case GGML_OP_MUL_MAT_ID:
ggml_zendnn_compute_forward_mul_mat_id(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@ -361,6 +528,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
const ggml_tensor * weights = op->src[0];
const ggml_tensor * inputs = op->src[1];
@ -374,6 +542,17 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
return false;
}
// MUL_MAT_ID performs best with a moderate number of experts due to its
// gather + batched matmul + scatter approach. Future versions will leverage
// ZenDNN's grouped_gemm for better scalability with larger expert counts:
// https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_group_gemm_operator.md
if (op->op == GGML_OP_MUL_MAT_ID) {
const int64_t n_experts = weights->ne[2];
const int64_t max_experts = 32;
if (n_experts > max_experts) {
return false;
}
}
switch (weights->type) {
case GGML_TYPE_F32:
case GGML_TYPE_BF16:

View File

@ -0,0 +1,266 @@
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{%- macro format_function_declaration(tool_data) -%}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{%- macro format_argument(argument, escape_keys=True) -%}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{%- macro strip_thinking(text) -%}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{%- set ns = namespace(prev_message_type=None) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{#- Handle System/Tool Definitions Block -#}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{#- Inject Thinking token at the very top of the FIRST system turn -#}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{#- Loop through messages -#}
{%- for message in loop_messages -%}
{%- set ns.prev_message_type = None -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{{- '<|turn>' + role + '\n' }}
{%- if message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{%- if message['tool_responses'] -%}
{#- Tool Response handling -#}
{%- for tool_response in message['tool_responses'] -%}
{{- '<|tool_response>' -}}
{%- if tool_response['response'] is mapping -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
{%- for key, value in tool_response['response'] | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- if not (message['tool_responses'] and not message['content']) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}

View File

@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
// note: the SWA cache is never quantized because it is relatively small
kv_swa = std::make_unique<llama_kv_cache>(
model, GGML_TYPE_F16, GGML_TYPE_F16,
model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}

View File

@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
// normalizer, then BPE merges run on the whole text without
// word-level pre-splitting. We only need to split on newlines
// since BPE merge lookup asserts no newlines in tokens.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
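// illustrative: " Hello" becomes "▁Hello" via llama_escape_whitespace before
// this split runs, so merges can yield SPM-style pieces like "▁Hello" directly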
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}
std::vector<std::string> regex_exprs;
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
};
struct llm_tokenizer_bpe_session {
@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
symbols_final.clear();
auto tok_pre = vocab.get_pre_type();
for (const auto & word : word_collection) {
work_queue = llm_bigram_bpe::queue();
@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
auto tok = vocab.text_to_token(word);
if (tok != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
}
while (offset < word.size()) {
@ -1864,7 +1883,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = 3; // <|plamo:pad|>
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "gemma4") {
type = LLAMA_VOCAB_TYPE_SPM;
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
{
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
std::string first;
std::string second;
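// merge entries are stored as "left right"; search from index 1 so a
// left piece that is itself a space is not mistaken for the separator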
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
bpe_ranks.emplace(std::make_pair(first, second), i);
}
}
// default special tokens (to be read from GGUF)
special_bos_id = LLAMA_TOKEN_NULL;
@ -1874,7 +1917,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
tokenizer_pre = "gemma4";
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
@ -1882,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
escape_whitespaces = false;
clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@ -1948,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "jais-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
} else if (
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
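// replace spaces with ▁ before BPE so merges see SPM-style pieces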
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||
@ -3045,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (escape_whitespaces) {
llama_escape_whitespace(text);
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
@ -3224,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
if (escape_whitespaces) {
// SPM-style BPE: tokens contain ▁ for spaces
std::string result = token_text;
llama_unescape_whitespace(result);
return _try_copy(result.data(), result.size());
}
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}
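
Taken together, the tokenize and token_to_piece hunks make this BPE path whitespace-compatible with SPM: spaces are escaped to U+2581 (▁) before splitting, and normal tokens are unescaped when rendered back to text. A behavioral sketch of that round trip (the real helpers are llama_escape_whitespace / llama_unescape_whitespace, referenced above; this is not their exact implementation):

#include <string>

// Replace every occurrence of `from` with `to`, in place.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

static void escape_whitespace(std::string & text) {
    replace_all(text, " ", "\xe2\x96\x81"); // ' ' -> U+2581 LOWER ONE EIGHTH BLOCK
}

static void unescape_whitespace(std::string & text) {
    replace_all(text, "\xe2\x96\x81", " "); // U+2581 -> ' '
}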

View File

@@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
};
struct LLM_KV;

View File

@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
return false;
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
start += offset;
}
return unicode_byte_encoding_process(bpe_words);
if (byte_encode) {
return unicode_byte_encoding_process(bpe_words);
}
return bpe_words;
}
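
The new byte_encode parameter makes the final GPT-2-style byte-to-printable mapping optional, so a caller can receive the raw regex splits instead; presumably this is what lets the SPM-flavored Gemma4 path reuse this function. A usage sketch against the signature above (the regex literal is a placeholder, not a real pre-tokenizer pattern):

#include <string>
#include <vector>
#include "unicode.h" // declares unicode_regex_split (see the header diff below)

std::vector<std::string> split_raw(const std::string & text) {
    const std::vector<std::string> regexes = { "\\s+|\\S+" }; // placeholder pattern
    // Passing byte_encode = false skips unicode_byte_encoding_process(...),
    // returning the split pieces exactly as they appear in the input text.
    return unicode_regex_split(text, regexes, /*byte_encode=*/false);
}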

View File

@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
bool unicode_cpt_is_han(uint32_t cpt);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);

View File

@@ -589,6 +589,51 @@ static common_chat_tool amount_tool{
})",
};
static common_chat_tool toggle_tool{
/* .name = */ "toggle",
/* .description = */ "Toggle a feature",
/* .parameters = */ R"({
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Whether to enable the feature"
}
},
"required": ["enabled"]
})",
};
static common_chat_tool nullable_tool{
/* .name = */ "set_nullable",
/* .description = */ "Set a nullable value",
/* .parameters = */ R"({
"type": "object",
"properties": {
"value": {
"type": "null",
"description": "A null value"
}
},
"required": ["value"]
})",
};
static common_chat_tool config_tool{
/* .name = */ "set_config",
/* .description = */ "Set configuration",
/* .parameters = */ R"({
"type": "object",
"properties": {
"config": {
"type": "object",
"description": "Configuration dict"
}
},
"required": ["config"]
})",
};
static common_chat_tool imaginary_number_tool{
/* .name = */ "imaginary_number",
/* .description = */ "Imaginary number converter",
@@ -1869,6 +1914,130 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run();
}
{
// Google Gemma 4 (tool calling with Gemma4 dict format)
auto tst = peg_tester("models/templates/gemma4.jinja");
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
// Simple tool call with string argument
tst.test(
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", R"({"city": "London"})"))
.run();
// Tool call with string argument containing special chars
tst.test(
"<|tool_call>call:get_time{city:<|\"|>San Francisco<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", R"({"city": "San Francisco"})"))
.run();
// Tool call with empty args
tst.test(
"<|tool_call>call:empty_args{}<tool_call|>")
.tools({ empty_args_tool })
.expect(message_with_tool_calls("empty_args", "{}"))
.run();
// Tool call with string and content
tst.test(
"Hello, world!\nWhat's up?<|tool_call>call:get_time{city:<|\"|>Paris<|\"|>}<tool_call|>")
.tools({ get_time_tool })
.expect(message_with_content_and_tool_call("Hello, world!\nWhat's up?", "get_time", R"({"city": "Paris"})"))
.run();
// Parallel tool calls
tst.test(
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>"
"<|tool_call>call:get_weather{city:<|\"|>Paris<|\"|>}<tool_call|>")
.tools({ get_time_tool, get_weather_tool })
.parallel_tool_calls(true)
.expect_tool_calls({
{ "get_time", R"({"city": "London"})", "" },
{ "get_weather", R"({"city": "Paris"})", "" },
})
.run();
// Tool call with integer argument (number type)
tst.test(
"<|tool_call>call:special_function{arg1:42}<tool_call|>")
.tools({ special_function_tool })
.expect(message_with_tool_calls("special_function", R"({"arg1": 42})"))
.run();
// Tool call with negative number argument
tst.test(
"<|tool_call>call:special_function{arg1:-7}<tool_call|>")
.tools({ special_function_tool })
.expect(message_with_tool_calls("special_function", R"({"arg1": -7})"))
.run();
// Tool call with decimal number argument
tst.test(
"<|tool_call>call:amount{orig:3.14}<tool_call|>")
.tools({ amount_tool })
.expect(message_with_tool_calls("amount", R"({"orig": 3.14})"))
.run();
// Tool call with boolean argument (true)
tst.test(
"<|tool_call>call:toggle{enabled:true}<tool_call|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
.run();
// Tool call with boolean argument (false)
tst.test(
"<|tool_call>call:toggle{enabled:false}<tool_call|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": false})"))
.run();
// Tool call with null argument
tst.test(
"<|tool_call>call:set_nullable{value:null}<tool_call|>")
.tools({ nullable_tool })
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
.run();
// Tool call with array argument (todo list)
tst.test(
"<|tool_call>call:todo_list{todos:[<|\"|>buy milk<|\"|>,<|\"|>walk dog<|\"|>]}<tool_call|>")
.tools({ todo_list })
.expect(message_with_tool_calls("todo_list", R"({"todos":["buy milk","walk dog"]})"))
.run();
// Tool call with object/dict argument
tst.test(
"<|tool_call>call:set_config{config:{theme:<|\"|>dark<|\"|>,count:3}}<tool_call|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config":{"theme":"dark","count":3}})"))
.run();
// Tool call with empty array
tst.test(
"<|tool_call>call:todo_list{todos:[]}<tool_call|>")
.tools({ todo_list })
.expect(message_with_tool_calls("todo_list", R"({"todos":[]})"))
.run();
// Tool call with empty dict
tst.test(
"<|tool_call>call:set_config{config:{}}<tool_call|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config":{}})"))
.run();
// Tool call with scientific notation number
tst.test(
"<|tool_call>call:amount{orig:1.5e10}<tool_call|>")
.tools({ amount_tool })
.expect(message_with_tool_calls("amount", R"({"orig": 1.5e10})"))
.run();
}
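// Summary of the Gemma4 tool-call wire format exercised above; the
// delimiters are copied from the test strings and the right-hand side
// shows the JSON each case is expected to parse into (an illustration
// derived from these tests, not a grammar specification):
//
//   <|tool_call>call:NAME{ARGS}<tool_call|>               one block per call
//   string   city:<|"|>London<|"|>                     -> "city": "London"
//   number   arg1:42, arg1:-7, orig:3.14, orig:1.5e10  -> emitted verbatim
//   boolean  enabled:true / enabled:false              -> true / false
//   null     value:null                                -> null
//   array    todos:[<|"|>buy milk<|"|>,<|"|>walk dog<|"|>] -> ["buy milk","walk dog"]
//   object   config:{theme:<|"|>dark<|"|>,count:3}     -> {"theme":"dark","count":3}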
{
// Qwen-QwQ-32B (reasoning model)
auto tst = peg_tester("models/templates/Qwen-QwQ-32B.jinja");

View File

@@ -155,8 +155,8 @@ struct server_slot {
int64_t t_start_process_prompt;
int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
double t_prompt_processing = 0.0; // ms
double t_token_generation = 0.0; // ms
std::function<void(int /* id_slot */)> callback_on_release;
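
The default member initializers above address a concrete failure mode: a slot released before it ever timed a prompt would read indeterminate doubles into its metrics. A minimal illustration of the bug class and the fix (not the server code itself):

// Reading a never-assigned non-static member yields an indeterminate value
// (undefined behavior when used), so any metric derived from it is garbage.
struct slot_bad  { double t_prompt_processing;       }; // indeterminate until set
struct slot_good { double t_prompt_processing = 0.0; }; // well-defined default

double report_ms(const slot_good & s) {
    return s.t_prompt_processing; // safe even if no prompt was ever processed
}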

View File

@@ -261,14 +261,14 @@ struct result_timings {
int32_t cache_n = -1;
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
double prompt_ms = 0.0;
double prompt_per_token_ms = 0.0;
double prompt_per_second = 0.0;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
double predicted_ms = 0.0;
double predicted_per_token_ms = 0.0;
double predicted_per_second = 0.0;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
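
The 0.0 defaults above keep these fields well-defined even when a result is emitted before any timing ran. For reference, a sketch of how per-token and per-second figures are conventionally derived from a token count and a total duration (assumed formulas with zero-guards matching the 0.0 defaults; not necessarily the exact server code):

#include <cstdint>

// total_ms: elapsed milliseconds; n: number of tokens processed.
static double per_token_ms(double total_ms, int32_t n) {
    return n > 0 ? total_ms / n : 0.0;
}

static double per_second(double total_ms, int32_t n) {
    return total_ms > 0.0 ? 1e3 * n / total_ms : 0.0; // tokens per second
}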