Merge branch 'master' into HEAD

2025-12-10 13:41:32 +02:00 · 2025-12-10 13:41:32 +02:00 · 81cb5783c8
parent 6dc6614bf0 9e79b0116e
commit 81cb5783c8
46 changed files with 3074 additions and 293 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -15,6 +15,7 @@ The project differentiates between 3 levels of contributors:
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -1,5 +1,6 @@
 #include "chat.h"
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"
@ -150,6 +151,7 @@ struct templates_params {
    common_chat_tool_choice tool_choice;
    json json_schema;
    bool parallel_tool_calls;
+    common_reasoning_format reasoning_format;
    bool stream;
    std::string grammar;
    bool add_generation_prompt = true;
@ -589,6 +591,16 @@ common_chat_templates_ptr common_chat_templates_init(
            "{%- if false %}");
    }

+    // TODO @aldehir : this is a temporary fix, pending Minja changes
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+            // search for the error message and patch it
+            && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+            "{%- if false %}");
+    }
+
    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
    bool add_bos = false;
@ -987,6 +999,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    return data;
 }

+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto role = msg.value("role", "");
+        if (role != "system" && role != "assistant") {
+            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+            adjusted_messages.push_back(msg);
+            continue;
+        }
+
+        auto content = json::array();
+
+        // If message contains `reasoning_content`, add it as a block of type `thinking`
+        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+            content.push_back({
+                {"type", "thinking"},
+                {"thinking", msg.at("reasoning_content").get<std::string>()},
+            });
+        }
+
+        // If message contains `content`, add it as a block of type `text`
+        if (msg.contains("content")) {
+            if (msg.at("content").is_string()) {
+                content.push_back({
+                    {"type", "text"},
+                    {"text", msg.at("content").get<std::string>()},
+                });
+            } else if (msg.at("content").is_array()) {
+                auto blocks = msg.at("content");
+                content.insert(content.end(), blocks.begin(), blocks.end());
+            }
+        }
+
+        auto adjusted = msg;
+        adjusted["content"] = content;
+        adjusted.erase("reasoning_content");
+        adjusted_messages.push_back(adjusted);
+    }
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = {
+        "[THINK]",
+        "[/THINK]",
+        "[TOOL_CALLS]",
+        "[ARGS]",
+    };
+
+    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            // Ministral wants to emit json surrounded by code fences
+            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                const auto & schema = function.at("parameters");
+
+                tool_choice |= p.rule("tool-" + name,
+                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+                );
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@ -2341,6 +2465,7 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
    params.add_generation_prompt = inputs.add_generation_prompt;
    params.tool_choice = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
    params.enable_thinking = inputs.enable_thinking;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
@ -2504,6 +2629,13 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
    }

+    // Ministral/Mistral Large 3
+    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+        src.find("[TOOL_CALLS]") != std::string::npos &&
+        src.find("[ARGS]") != std::string::npos) {
+        return common_chat_params_init_ministral_3(tmpl, params);
+    }
+
    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
        return common_chat_params_init_magistral(tmpl, params);
    }
--- a/common/console.cpp
+++ b/common/console.cpp
@ -1,6 +1,11 @@
 #include "console.h"
 #include <vector>
 #include <iostream>
+#include <cassert>
+#include <cstddef>
+#include <cctype>
+#include <cwctype>
+#include <cstdint>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@ -35,9 +40,26 @@

 namespace console {

+#if defined (_WIN32)
+    namespace {
+        // Use private-use unicode values to represent special keys that are not reported
+        // as characters (e.g. arrows on Windows). These values should never clash with
+        // real input and let the rest of the code handle navigation uniformly.
+        static constexpr char32_t KEY_ARROW_LEFT       = 0xE000;
+        static constexpr char32_t KEY_ARROW_RIGHT      = 0xE001;
+        static constexpr char32_t KEY_ARROW_UP         = 0xE002;
+        static constexpr char32_t KEY_ARROW_DOWN       = 0xE003;
+        static constexpr char32_t KEY_HOME             = 0xE004;
+        static constexpr char32_t KEY_END              = 0xE005;
+        static constexpr char32_t KEY_CTRL_ARROW_LEFT  = 0xE006;
+        static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
+        static constexpr char32_t KEY_DELETE           = 0xE008;
+    }
+
    //
    // Console state
    //
+#endif

    static bool      advanced_display = false;
    static bool      simple_io        = true;
@ -176,7 +198,18 @@ namespace console {
            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
                if (wc == 0) {
-                    continue;
+                    const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
+                    const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
+                    switch (record.Event.KeyEvent.wVirtualKeyCode) {
+                        case VK_LEFT:   return ctrl_pressed ? KEY_CTRL_ARROW_LEFT  : KEY_ARROW_LEFT;
+                        case VK_RIGHT:  return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
+                        case VK_UP:     return KEY_ARROW_UP;
+                        case VK_DOWN:   return KEY_ARROW_DOWN;
+                        case VK_HOME:   return KEY_HOME;
+                        case VK_END:    return KEY_END;
+                        case VK_DELETE: return KEY_DELETE;
+                        default:        continue;
+                    }
                }

                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
@ -315,6 +348,52 @@ namespace console {
 #endif
    }

+    static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
+        unsigned char c = static_cast<unsigned char>(input[pos]);
+        if ((c & 0x80u) == 0u) {
+            advance = 1;
+            return c;
+        }
+        if ((c & 0xE0u) == 0xC0u && pos + 1 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            if ((c1 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 2;
+            return ((c & 0x1Fu) << 6) | (static_cast<unsigned char>(input[pos + 1]) & 0x3Fu);
+        }
+        if ((c & 0xF0u) == 0xE0u && pos + 2 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
+            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 3;
+            return ((c & 0x0Fu) << 12) |
+                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 6) |
+                   (static_cast<unsigned char>(input[pos + 2]) & 0x3Fu);
+        }
+        if ((c & 0xF8u) == 0xF0u && pos + 3 < input.size()) {
+            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
+            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
+            unsigned char c3 = static_cast<unsigned char>(input[pos + 3]);
+            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u || (c3 & 0xC0u) != 0x80u) {
+                advance = 1;
+                return 0xFFFD;
+            }
+            advance = 4;
+            return ((c & 0x07u) << 18) |
+                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 12) |
+                   ((static_cast<unsigned char>(input[pos + 2]) & 0x3Fu) << 6) |
+                   (static_cast<unsigned char>(input[pos + 3]) & 0x3Fu);
+        }
+
+        advance = 1;
+        return 0xFFFD; // replacement character for invalid input
+    }
+
    static void append_utf8(char32_t ch, std::string & out) {
        if (ch <= 0x7F) {
            out.push_back(static_cast<unsigned char>(ch));
@ -336,22 +415,319 @@ namespace console {
    }

    // Helper function to remove the last UTF-8 character from a string
-    static void pop_back_utf8_char(std::string & line) {
-        if (line.empty()) {
+    static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
+        if (pos == 0) return 0;
+        pos--;
+        while (pos > 0 && (line[pos] & 0xC0) == 0x80) {
+            pos--;
+        }
+        return pos;
+    }
+
+    static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
+        if (pos >= line.length()) return line.length();
+        pos++;
+        while (pos < line.length() && (line[pos] & 0xC0) == 0x80) {
+            pos++;
+        }
+        return pos;
+    }
+
+    static void move_cursor(int delta);
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
+
+    static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
+        if (char_pos >= widths.size()) {
            return;
        }

-        size_t pos = line.length() - 1;
+        size_t next_pos = next_utf8_char_pos(line, byte_pos);
+        int w = widths[char_pos];
+        size_t char_len = next_pos - byte_pos;

-        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-            if ((line[pos] & 0xC0) != 0x80) {
-                break; // Found the start of the character
-            }
+        line.erase(byte_pos, char_len);
+        widths.erase(widths.begin() + char_pos);
+
+        size_t p = byte_pos;
+        int tail_width = 0;
+        for (size_t i = char_pos; i < widths.size(); ++i) {
+            size_t following = next_utf8_char_pos(line, p);
+            put_codepoint(line.c_str() + p, following - p, widths[i]);
+            tail_width += widths[i];
+            p = following;
        }
-        line.erase(pos);
+
+        for (int i = 0; i < w; ++i) {
+            fputc(' ', out);
+        }
+
+        move_cursor(-(tail_width + w));
    }

+    static void clear_current_line(const std::vector<int> & widths) {
+        int total_width = 0;
+        for (int w : widths) {
+            total_width += (w > 0 ? w : 1);
+        }
+
+        if (total_width > 0) {
+            std::string spaces(total_width, ' ');
+            fwrite(spaces.c_str(), 1, total_width, out);
+            move_cursor(-total_width);
+        }
+    }
+
+    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
+                                  size_t & byte_pos) {
+        move_to_line_start(char_pos, byte_pos, widths);
+        clear_current_line(widths);
+
+        line = std::move(new_line);
+        widths.clear();
+        byte_pos = 0;
+        char_pos = 0;
+
+        size_t idx = 0;
+        while (idx < line.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, idx, advance);
+            int expected_width = estimateWidth(cp);
+            int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
+            if (real_width < 0) real_width = 0;
+            widths.push_back(real_width);
+            idx += advance;
+            ++char_pos;
+            byte_pos = idx;
+        }
+    }
+
+    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
+        int back_width = 0;
+        for (size_t i = 0; i < char_pos; ++i) {
+            back_width += widths[i];
+        }
+        move_cursor(-back_width);
+        char_pos = 0;
+        byte_pos = 0;
+    }
+
+    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        int forward_width = 0;
+        for (size_t i = char_pos; i < widths.size(); ++i) {
+            forward_width += widths[i];
+        }
+        move_cursor(forward_width);
+        char_pos = widths.size();
+        byte_pos = line.length();
+    }
+
+    static bool has_ctrl_modifier(const std::string & params) {
+        size_t start = 0;
+        while (start < params.size()) {
+            size_t end = params.find(';', start);
+            size_t len = (end == std::string::npos) ? params.size() - start : end - start;
+            if (len > 0) {
+                int value = 0;
+                for (size_t i = 0; i < len; ++i) {
+                    char ch = params[start + i];
+                    if (!std::isdigit(static_cast<unsigned char>(ch))) {
+                        value = -1;
+                        break;
+                    }
+                    value = value * 10 + (ch - '0');
+                }
+                if (value == 5) {
+                    return true;
+                }
+            }
+
+            if (end == std::string::npos) {
+                break;
+            }
+            start = end + 1;
+        }
+        return false;
+    }
+
+    static bool is_space_codepoint(char32_t cp) {
+        return std::iswspace(static_cast<wint_t>(cp)) != 0;
+    }
+
+    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos == 0) {
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0;
+
+        while (new_char_pos > 0) {
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        while (new_char_pos > 0) {
+            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, prev_byte, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos - 1];
+            new_char_pos--;
+            new_byte_pos = prev_byte;
+        }
+
+        move_cursor(-move_width);
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
+        if (char_pos >= widths.size()) {
+            return;
+        }
+
+        size_t new_char_pos = char_pos;
+        size_t new_byte_pos = byte_pos;
+        int move_width = 0;
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        while (new_char_pos < widths.size()) {
+            size_t advance = 0;
+            char32_t cp = decode_utf8(line, new_byte_pos, advance);
+            if (!is_space_codepoint(cp)) {
+                break;
+            }
+            move_width += widths[new_char_pos];
+            new_char_pos++;
+            new_byte_pos += advance;
+        }
+
+        move_cursor(move_width);
+        char_pos = new_char_pos;
+        byte_pos = new_byte_pos;
+    }
+
+    static void move_cursor(int delta) {
+        if (delta == 0) return;
+#if defined(_WIN32)
+        if (hConsole != NULL) {
+            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
+            COORD newCursorPosition = bufferInfo.dwCursorPosition;
+            int width = bufferInfo.dwSize.X;
+            int newX = newCursorPosition.X + delta;
+            int newY = newCursorPosition.Y;
+
+            while (newX >= width) {
+                newX -= width;
+                newY++;
+            }
+            while (newX < 0) {
+                newX += width;
+                newY--;
+            }
+
+            newCursorPosition.X = newX;
+            newCursorPosition.Y = newY;
+            SetConsoleCursorPosition(hConsole, newCursorPosition);
+        }
+#else
+        if (delta < 0) {
+            for (int i = 0; i < -delta; i++) fprintf(out, "\b");
+        } else {
+            for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
+        }
+#endif
+    }
+
+    struct history_t {
+        std::vector<std::string> entries;
+        size_t viewing_idx = SIZE_MAX;
+        std::string backup_line; // current line before viewing history
+        void add(const std::string & line) {
+            if (line.empty()) {
+                return;
+            }
+            // avoid duplicates with the last entry
+            if (entries.empty() || entries.back() != line) {
+                entries.push_back(line);
+            }
+            // also clear viewing state
+            end_viewing();
+        }
+        bool prev(std::string & cur_line) {
+            if (entries.empty()) {
+                return false;
+            }
+            if (viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            if (viewing_idx > 0) {
+                viewing_idx--;
+            }
+            cur_line = entries[viewing_idx];
+            return true;
+        }
+        bool next(std::string & cur_line) {
+            if (entries.empty() || viewing_idx == SIZE_MAX) {
+                return false;
+            }
+            viewing_idx++;
+            if (viewing_idx >= entries.size()) {
+                cur_line = backup_line;
+                end_viewing();
+            } else {
+                cur_line = entries[viewing_idx];
+            }
+            return true;
+        }
+        void begin_viewing(const std::string & line) {
+            backup_line = line;
+            viewing_idx = entries.size();
+        }
+        void end_viewing() {
+            viewing_idx = SIZE_MAX;
+            backup_line.clear();
+        }
+        bool is_viewing() const {
+            return viewing_idx != SIZE_MAX;
+        }
+    } history;
+
    static bool readline_advanced(std::string & line, bool multiline_input) {
        if (out != stdout) {
            fflush(stdout);
@ -362,8 +738,33 @@ namespace console {
        bool is_special_char = false;
        bool end_of_stream = false;

+        size_t byte_pos = 0; // current byte index
+        size_t char_pos = 0; // current character index (one char can be multiple bytes)
+
        char32_t input_char;
        while (true) {
+            assert(char_pos <= byte_pos);
+            assert(char_pos <= widths.size());
+            auto history_prev = [&]() {
+                if (!history.is_viewing()) {
+                    history.begin_viewing(line);
+                }
+                std::string new_line;
+                if (!history.prev(new_line)) {
+                    return;
+                }
+                set_line_contents(new_line, line, widths, char_pos, byte_pos);
+            };
+            auto history_next = [&]() {
+                if (history.is_viewing()) {
+                    std::string new_line;
+                    if (!history.next(new_line)) {
+                        return;
+                    }
+                    set_line_contents(new_line, line, widths, char_pos, byte_pos);
+                }
+            };
+
            fflush(out); // Ensure all output is displayed before waiting for input
            input_char = getchar32();

@ -371,7 +772,7 @@ namespace console {
                break;
            }

-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
                end_of_stream = true;
                break;
            }
@ -384,7 +785,71 @@ namespace console {

            if (input_char == '\033') { // Escape sequence
                char32_t code = getchar32();
-                if (code == '[' || code == 0x1B) {
+                if (code == '[') {
+                    std::string params;
+                    while (true) {
+                        code = getchar32();
+                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
+                            break;
+                        }
+                        params.push_back(static_cast<char>(code));
+                    }
+
+                    const bool ctrl_modifier = has_ctrl_modifier(params);
+
+                    if (code == 'D') { // left
+                        if (ctrl_modifier) {
+                            move_word_left(char_pos, byte_pos, widths, line);
+                        } else if (char_pos > 0) {
+                            int w = widths[char_pos - 1];
+                            move_cursor(-w);
+                            char_pos--;
+                            byte_pos = prev_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'C') { // right
+                        if (ctrl_modifier) {
+                            move_word_right(char_pos, byte_pos, widths, line);
+                        } else if (char_pos < widths.size()) {
+                            int w = widths[char_pos];
+                            move_cursor(w);
+                            char_pos++;
+                            byte_pos = next_utf8_char_pos(line, byte_pos);
+                        }
+                    } else if (code == 'H') { // home
+                        move_to_line_start(char_pos, byte_pos, widths);
+                    } else if (code == 'F') { // end
+                        move_to_line_end(char_pos, byte_pos, widths, line);
+                    } else if (code == 'A' || code == 'B') {
+                        // up/down
+                        if (code == 'A') {
+                            history_prev();
+                            is_special_char = false;
+                        } else if (code == 'B') {
+                            history_next();
+                            is_special_char = false;
+                        }
+                    } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
+                        std::string digits;
+                        for (char ch : params) {
+                            if (ch == ';') {
+                                break;
+                            }
+                            if (std::isdigit(static_cast<unsigned char>(ch))) {
+                                digits.push_back(ch);
+                            }
+                        }
+
+                        if (code == '~') {
+                            if (digits == "1" || digits == "7") { // home
+                                move_to_line_start(char_pos, byte_pos, widths);
+                            } else if (digits == "4" || digits == "8") { // end
+                                move_to_line_end(char_pos, byte_pos, widths, line);
+                            } else if (digits == "3") { // delete
+                                delete_at_cursor(line, widths, char_pos, byte_pos);
+                            }
+                        }
+                    }
+                } else if (code == 0x1B) {
                    // Discard the rest of the escape sequence
                    while ((code = getchar32()) != (char32_t) WEOF) {
                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
@ -392,28 +857,107 @@ namespace console {
                        }
                    }
                }
+#if defined(_WIN32)
+            } else if (input_char == KEY_ARROW_LEFT) {
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    byte_pos = prev_utf8_char_pos(line, byte_pos);
+                }
+            } else if (input_char == KEY_ARROW_RIGHT) {
+                if (char_pos < widths.size()) {
+                    int w = widths[char_pos];
+                    move_cursor(w);
+                    char_pos++;
+                    byte_pos = next_utf8_char_pos(line, byte_pos);
+                }
+            } else if (input_char == KEY_CTRL_ARROW_LEFT) {
+                move_word_left(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
+                move_word_right(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_HOME) {
+                move_to_line_start(char_pos, byte_pos, widths);
+            } else if (input_char == KEY_END) {
+                move_to_line_end(char_pos, byte_pos, widths, line);
+            } else if (input_char == KEY_DELETE) {
+                delete_at_cursor(line, widths, char_pos, byte_pos);
+            } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
+                if (input_char == KEY_ARROW_UP) {
+                    history_prev();
+                    is_special_char = false;
+                } else if (input_char == KEY_ARROW_DOWN) {
+                    history_next();
+                    is_special_char = false;
+                }
+#endif
            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (!widths.empty()) {
-                    int count;
-                    do {
-                        count = widths.back();
-                        widths.pop_back();
-                        // Move cursor back, print space, and move cursor back again
-                        for (int i = 0; i < count; i++) {
-                            replace_last(' ');
-                            pop_cursor();
-                        }
-                        pop_back_utf8_char(line);
-                    } while (count == 0 && !widths.empty());
+                if (char_pos > 0) {
+                    int w = widths[char_pos - 1];
+                    move_cursor(-w);
+                    char_pos--;
+                    size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
+                    size_t char_len = byte_pos - prev_pos;
+                    byte_pos = prev_pos;
+
+                    // remove the character
+                    line.erase(byte_pos, char_len);
+                    widths.erase(widths.begin() + char_pos);
+
+                    // redraw tail
+                    size_t p = byte_pos;
+                    int tail_width = 0;
+                    for (size_t i = char_pos; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    // clear display
+                    for (int i = 0; i < w; ++i) {
+                        fputc(' ', out);
+                    }
+                    move_cursor(-(tail_width + w));
                }
            } else {
-                int offset = line.length();
-                append_utf8(input_char, line);
-                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-                if (width < 0) {
-                    width = 0;
+                // insert character
+                std::string new_char_str;
+                append_utf8(input_char, new_char_str);
+                int w = estimateWidth(input_char);
+
+                if (char_pos == widths.size()) {
+                    // insert at the end
+                    line += new_char_str;
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+                    widths.push_back(real_w);
+                    byte_pos += new_char_str.length();
+                    char_pos++;
+                } else {
+                    // insert in middle
+                    line.insert(byte_pos, new_char_str);
+
+                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
+                    if (real_w < 0) real_w = 0;
+
+                    widths.insert(widths.begin() + char_pos, real_w);
+
+                    // print the tail
+                    size_t p = byte_pos + new_char_str.length();
+                    int tail_width = 0;
+                    for (size_t i = char_pos + 1; i < widths.size(); ++i) {
+                        size_t next_p = next_utf8_char_pos(line, p);
+                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
+                        tail_width += widths[i];
+                        p = next_p;
+                    }
+
+                    move_cursor(-tail_width);
+
+                    byte_pos += new_char_str.length();
+                    char_pos++;
                }
-                widths.push_back(width);
            }

            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
@ -451,6 +995,15 @@ namespace console {
            }
        }

+        if (!end_of_stream && !line.empty()) {
+            // remove the trailing newline for history storage
+            if (!line.empty() && line.back() == '\n') {
+                line.pop_back();
+            }
+            // TODO: maybe support multiline history entries?
+            history.add(line);
+        }
+
        fflush(out);
        return has_more;
    }
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -383,6 +383,17 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
+                    if name.endswith(".activation_scale"):  # unused
+                        tensors_to_remove.append(name)
+                    # mistral format
+                    if name.endswith(".qscale_weight"):
+                        weight_name = name.removesuffix("qscale_weight") + "weight"
+                        w = self.model_tensors[weight_name]
+                        s = self.model_tensors[name]
+                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+                        tensors_to_remove.append(name)
+                    if name.endswith(".qscale_act"):
+                        tensors_to_remove.append(name)
            elif quant_method == "gptq":
                for name in self.model_tensors.keys():
                    if name.endswith(".qweight"):
@ -2854,13 +2865,10 @@ class Mistral3Model(LlamaModel):
            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
-        if name.endswith("weight_scale_inv"):
-            raise ValueError("This is a quantized weight, please use BF16 weight instead")
-
        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []
+
        return super().modify_tensors(data_torch, name, bid)


@ -5825,9 +5833,11 @@ class Gemma3Model(TextModel):
    norm_shift = 1.0  # Gemma3RMSNorm adds 1.0 to the norm value

    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-        self.gguf_writer.add_add_space_prefix(False)
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_add_space_prefix(False)
+        else:
+            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        hparams = self.hparams
@ -5845,13 +5855,24 @@ class Gemma3Model(TextModel):
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
        # attn_logit_softcapping is removed in Gemma3
        assert hparams.get("attn_logit_softcapping") is None
-        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        if (final_logit_softcap := hparams.get("final_logit_softcapping")):
+            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+        if hparams.get("sliding_window_pattern") != 1:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
        if hparams.get("rope_scaling") is not None:
-            assert hparams["rope_scaling"]["rope_type"] == "linear"
-            # important: this rope_scaling is only applied for global layers, and not used by 1B model
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+            rope_scaling = hparams["rope_scaling"]
+            if rope_scaling["rope_type"] == "linear":
+                # important: this rope_scaling is only applied for global layers, and not used by 1B model
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            elif rope_scaling["rope_type"] == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
+                self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
+                self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@ -5865,8 +5886,10 @@ class Gemma3Model(TextModel):

        # remove OOV (out-of-vocabulary) rows in token_embd
        if "embed_tokens.weight" in name:
-            vocab = self._create_vocab_sentencepiece()
-            tokens = vocab[0]
+            if (self.dir_model / "tokenizer.model").is_file():
+                tokens = self._create_vocab_sentencepiece()[0]
+            else:
+                tokens = self.get_vocab_base()[0]
            data_torch = data_torch[:len(tokens)]

        # ref code in Gemma3RMSNorm
@ -9883,6 +9906,18 @@ class MistralModel(LlamaModel):
            self.gguf_writer.add_architecture()
            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

+    def dequant_model(self):
+        # transform quantization config into HF format
+        quant_config = self.hparams.get("quantization")
+        if quant_config is not None:
+            assert quant_config["qformat_weight"] == "fp8_e4m3"
+            self.hparams["quantization_config"] = {
+                "activation_scheme": "static",
+                "quant_method": "fp8",
+                "weight_block_size": None,
+            }
+        return super().dequant_model()
+
    @staticmethod
    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
--- a/docs/ops.md
+++ b/docs/ops.md
@ -36,14 +36,15 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
@ -102,7 +103,7 @@ Legend:
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@ -115,8 +116,8 @@ Legend:
 |                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                              TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
@ -4964,6 +4964,7 @@
 "CPU","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CPU"
+"CPU","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CPU"
 "CPU","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CPU"
@ -5419,17 +5420,45 @@
 "CPU","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
+"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
 "CPU","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
@ -5655,6 +5684,7 @@
 "CPU","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CPU"
@ -8644,9 +8674,13 @@
 "CPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CPU"
 "CPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
@ -8666,9 +8700,13 @@
 "CPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CPU"
@ -9411,18 +9449,405 @@
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","CPU"
+"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CPU"
@ -9435,6 +9860,10 @@
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CPU"
+"CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
@ -9463,15 +9892,30 @@
 "CPU","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CPU"
 "CPU","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CPU"
+"CPU","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CPU"
 "CPU","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CPU"
+"CPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CPU"
 "CPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CPU"
@ -9480,6 +9924,10 @@
 "CPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CPU"
+"CPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CPU"
+"CPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CPU"
@ -9487,10 +9935,16 @@
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","1","yes","CPU"
+"CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","CPU"
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
@ -4964,6 +4964,7 @@
 "CUDA0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CUDA"
+"CUDA0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CUDA"
 "CUDA0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CUDA"
@ -5419,17 +5420,45 @@
 "CUDA0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
+"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
 "CUDA0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
@ -5655,6 +5684,7 @@
 "CUDA0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CUDA"
@ -8644,9 +8674,13 @@
 "CUDA0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
 "CUDA0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CUDA"
 "CUDA0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
@ -8666,9 +8700,13 @@
 "CUDA0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CUDA"
@ -9411,18 +9449,405 @@
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","CUDA"
 "CUDA0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CUDA"
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","CUDA"
+"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CUDA"
@ -9435,6 +9860,10 @@
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CUDA"
+"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
@ -9463,34 +9892,59 @@
 "CUDA0","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CUDA"
 "CUDA0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CUDA"
+"CUDA0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CUDA"
 "CUDA0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CUDA"
 "CUDA0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","CUDA"
-"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CUDA"
+"CUDA0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CUDA"
+"CUDA0","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CUDA"
 "CUDA0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","CUDA"
--- a/examples/model-conversion/logits.cpp
+++ b/examples/model-conversion/logits.cpp
@ -144,7 +144,7 @@ int main(int argc, char ** argv) {
            return 1;
        }
        std::string s(buf, n);
-        printf("%s", s.c_str());
+        printf("%s (%d)", s.c_str(), id);
    }
    printf("\n");

--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@ -25,6 +25,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
+        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@ -2251,12 +2251,12 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
                                  int                         sections[4],
                                  bool                        mrope_used,
                                  bool                        is_imrope,
-                                  bool                        indep_sects) {
-    ggml_tensor * src0 = dst->src[0];  // input
+                                  bool                        indep_sects,
+                                  int64_t                     rope_dims) {
    ggml_tensor * src1 = dst->src[1];  // position
    ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = src0->ne[0] / 2;
+    int64_t theta_scale_length = rope_dims / 2;
    int64_t position_length    = dst->ne[2];

    // TODO: check theta_scale_length and position_length.
@ -2331,18 +2331,17 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
                                   ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
                                   ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
-
-        acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
-                                                         theta_scale_ne, theta_scale_nb, 1);
    }
+    acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+                                                     theta_scale_ne, theta_scale_nb, 1);

    // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
+    // TODO: acl_yarn_ramp_tensor use rope cache.
    bool                 yarn_ramp_tensor_updated = false;
    ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
    acl_tensor_ptr       acl_yarn_ramp_tensor;
-    if (ext_factor != 0 &&
-        // TODO: check more parameter.
-        (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) {
+    if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
+                            ctx.rope_cache.freq_scale != freq_scale)) {
        yarn_ramp_tensor_updated = true;

        // -rope_yarn_ramp
@ -2590,7 +2589,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
    }

-    int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 };
+    int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@ -2645,7 +2644,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    // param
    float     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    int sections[4];
+    int       sections[4];
    // const int n_past     = ((int32_t *) dst->op_params)[0];
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
@ -2654,44 +2653,60 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    GGML_TENSOR_UNARY_OP_LOCALS

-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
+    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);

-    // TODO: n_dims <= ne0
-    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
+    GGML_ASSERT(n_dims <= ne00);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-    bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
-    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
-    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+    bool       is_neox    = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_imrope  = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl apply interleaved mrope
+    // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
+    // Note: this bit is also set for imrope and some vision modes,
+    // so mrope_used does NOT exclusively indicate pure mrope.
+    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision  = mode == GGML_ROPE_TYPE_VISION;

    if (mrope_used) {
        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
    }

    if (is_vision) {
-        GGML_ASSERT(n_dims == ne0/2);
+        GGML_ASSERT(n_dims == ne0 / 2);
    }

    if (is_imrope || mrope_used) {
        is_neox = true;
    }

-    // init ctx.rope_cos/rope_sin cache
-    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision);
+    int64_t rope_dims = n_dims;

-    int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
+    //Our current RotaryPositionEmbedding does not support the VISION mode,
+    //but essentially it only modifies theta_base in mrope,
+    //then repeats it at the end in the same way as is_neox.
+    //In fact, RoPE is still applied across all dimensions.
+    if (is_vision) {
+        rope_dims = src0->ne[0];
+    }
+    int64_t tail_dims = ne00 - rope_dims;
+    bool    has_tail  = tail_dims > 0;
+
+    // init ctx.rope_cos/rope_sin cache
+    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
+                          mrope_used, is_imrope, is_vision, rope_dims);
+
+    // Cache is generated with ne00 dimensions, so we use ne00 for reshape
+    int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@ -2704,7 +2719,6 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
-
 #ifdef ASCEND_310P
    // Special ROPE operation for 310P

@ -2844,46 +2858,124 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    }
    return;
 #endif
-
    int64_t acl_mode = is_neox ? 0 : 1;

-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
-                                        acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
-                break;
-            }
-        case GGML_TYPE_F16:
-            {
-                ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
-                void *               src_trans_buffer = src_trans_allocator.get();
-                ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
-                void *               dst_trans_buffer = dst_trans_allocator.get();
+    // Pre-define head and tail dimensions for reuse
+    int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
+    int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };

-                size_t src_trans_nb[GGML_MAX_DIMS];
-                src_trans_nb[0] = sizeof(float);
-                for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
-                }
+    // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
+    bool                 src_dst_need_trans = false;
+    ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
+    ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
+    acl_tensor_ptr       acl_src_trans_tensor;
+    acl_tensor_ptr       acl_dst_trans_tensor;
+    void *               src_trans_buffer = nullptr;
+    void *               dst_trans_buffer = nullptr;
+    size_t               src_dst_trans_nb[GGML_MAX_DIMS];
+    if (src0->type == GGML_TYPE_F16) {
+        src_dst_need_trans = true;
+        src_trans_buffer   = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
+        dst_trans_buffer   = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));

-                acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
-                    src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
-                acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
-                    dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);
+        src_dst_trans_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
+        }
+        acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
+                                                       src_dst_trans_nb, GGML_MAX_DIMS);
+        acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
+                                                       src_dst_trans_nb, GGML_MAX_DIMS);
+        aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
+    }

-                aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
+    // Step 2: Prepare head tensors for tail splitting if needed
+    acl_tensor_ptr acl_src_head;
+    acl_tensor_ptr acl_dst_head;
+    if (has_tail) {
+        // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
+        // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
+        if (src_dst_need_trans) {
+            // Use F32 trans tensor strides
+            acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
+                                                   src_dst_trans_nb, GGML_MAX_DIMS);
+        } else {
+            // Use original F32 tensor strides
+            acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
+                                                   GGML_MAX_DIMS);
+        }

-                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
-                                        acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
-                                        acl_dst_trans_tensor.get());
+        int64_t              head_elements = rope_dims * ne01 * ne02 * ne03;
+        ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float));
+        void *               dst_head_contiguous_buffer = dst_head_contiguous_allocator.get();

-                aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
-                break;
-            }
-        default:
-            GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
-            break;
+        size_t head_contiguous_nb[GGML_MAX_DIMS];
+        head_contiguous_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
+        }
+        acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
+                                               head_contiguous_nb, GGML_MAX_DIMS);
+    }
+
+    // Step 3: Execute RotaryPositionEmbedding
+    if (has_tail) {
+        // Rotate only the head portion (first rope_dims dimensions)
+        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
+                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
+
+        // Copy head result from contiguous buffer back to destination tensor
+        if (src_dst_need_trans) {
+            acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
+                (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+        } else {
+            acl_tensor_ptr acl_dst_head_target =
+                ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+        }
+    } else if (src_dst_need_trans) {
+        // Rotate full tensor (no tail), using trans tensors
+        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
+                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
+    } else {
+        // Rotate full tensor (no tail), using original tensors
+        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
+                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
+    }
+
+    // Step 4: Copy unrotated tail portion from source to destination
+    if (has_tail) {
+        size_t src_tail_offset;
+        size_t dst_tail_offset;
+
+        auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
+                                    size_t * nb_src_arr, size_t * nb_dst_arr) {
+            acl_tensor_ptr acl_src_tail =
+                ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
+            acl_tensor_ptr acl_dst_tail =
+                ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
+            cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
+        };
+
+        if (src_dst_need_trans) {
+            // Use F32 trans tensor strides and offsets
+            src_tail_offset = rope_dims * src_dst_trans_nb[0];
+            dst_tail_offset = rope_dims * src_dst_trans_nb[0];
+            copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
+                             ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
+        } else {
+            // Use original tensor strides and offsets
+            src_tail_offset = rope_dims * nb00;
+            dst_tail_offset = rope_dims * nb0;
+            copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
+                             ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
+        }
+    }
+
+    // Step 5: Cast back to F16 if needed
+    if (src_dst_need_trans) {
+        aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
    }
 }

--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@ -315,7 +315,7 @@ struct ggml_cann_rope_cache {
        if (theta_scale_exp_host) {
            free(theta_scale_exp_host);
        }
-        if(position_select_index_host) {
+        if (position_select_index_host) {
            free(position_select_index_host);
        }
    }
@ -340,7 +340,7 @@ struct ggml_cann_rope_cache {

    void set(int64_t theta_scale_length,
             int64_t position_length,
-             float    ext_factor,
+             float   ext_factor,
             float   theta_scale,
             float   freq_scale,
             float   attn_factor,
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -2308,7 +2308,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,

    bool cann_graph_update_required = false;
 #ifdef USE_ACL_GRAPH
-    bool use_cann_graph             = true;
+    bool use_cann_graph = true;

    static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
    if (!prefill_use_graph) {
@ -2338,7 +2338,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
        }
    }
 #else
-    bool use_cann_graph             = false;
+    bool use_cann_graph = false;
 #endif  // USE_ACL_GRAPH
    evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);

@ -2474,16 +2474,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
            }
        case GGML_OP_ROPE:
            {
-                // TODO: with ops-test v == 1
-                // TODO: n_dims <= ne0
-                if (op->src[0]->ne[0] != op->op_params[1]) {
-                    return false;
-                }
-
                if (op->src[0]->ne[0] > 896) {
                    return false;
                }
 #ifdef ASCEND_310P
+                // TODO: Support rope_dim < ne00(dim)
+                if (op->src[0]->ne[0] != op->op_params[1]) {
+                    return false;
+                }
                if (!ggml_is_contiguous(op->src[0])) {
                    return false;
                }
--- a/ggml/src/ggml-cuda/diag.cu
+++ b/ggml/src/ggml-cuda/diag.cu
@ -0,0 +1,77 @@
+#include "convert.cuh"
+#include "diag.cuh"
+#include "ggml.h"
+
+template <typename T>
+static __global__ void diag_kernel(T * __restrict__ dst,
+                                   const T * __restrict__ src,
+                                   const int64_t ne0,
+                                   const int64_t ne1,
+                                   const int64_t ne2,
+                                   const int64_t ne3,
+                                   const int64_t total_elements) {
+    const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int64_t i0 = global_idx % ne0;
+    const int64_t i1 = (global_idx / ne0) % ne1;
+    const int64_t i2 = (global_idx / (ne0 * ne1)) % ne2;
+    const int64_t i3 = global_idx / (ne0 * ne1 * ne2);
+
+    const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;
+
+    if (i0 == i1) {
+        const int64_t batch_idx = i3 * ne2 + i2;
+        const int64_t src_idx   = batch_idx * ne0 + i0;
+        dst[dst_idx]            = src[src_idx];
+    } else {
+        dst[dst_idx] = ggml_cuda_cast<T>(0);
+    }
+    GGML_UNUSED_VARS(ne3);
+}
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    void *       dst_d  = dst->data;
+    const void * src0_d = src0->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    GGML_ASSERT(ne00 == ne0);
+    GGML_ASSERT(ne01 == 1);
+    GGML_ASSERT(ne02 == ne2);
+    GGML_ASSERT(ne03 == ne3);
+
+    const int64_t n_elems    = ggml_nelements(dst);
+    const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
+            break;
+        case GGML_TYPE_F16:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
+            break;
+        default:
+            GGML_ABORT("unsupported type");
+    }
+}
--- a/ggml/src/ggml-cuda/diag.cuh
+++ b/ggml/src/ggml-cuda/diag.cuh
@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_DIAG_BLOCK_SIZE 256
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@ -564,6 +564,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
        for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
            const int i_KQ = i_KQ_0 + (threadIdx.y % np)*warp_size + threadIdx.x;

+#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+            // Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
+            // Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
+            KQ_acc[i_KQ_0/(np*warp_size)*cpw + jc0] *= 4.0f;
+#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+
            if (use_logit_softcap) {
                KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] = logit_softcap * tanhf(KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
            }
@ -858,6 +864,11 @@ static __global__ void flash_attn_tile(
 #pragma unroll
                for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
                    tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
+#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
+                    // Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
+                    // Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
+                    tmp_h2[i1/2] *= make_half2(0.25f, 0.25f);
+#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
                }
                ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
                    &Q_tmp[jc*(DKQ/2) + i0/2 + (threadIdx.y % np)*(warp_size*cpy_ne_D/2) + threadIdx.x*(cpy_ne_D/2)],
--- a/ggml/src/ggml-cuda/fill.cu
+++ b/ggml/src/ggml-cuda/fill.cu
@ -4,7 +4,7 @@
 #define CUDA_FILL_BLOCK_SIZE 256

 template <typename T>
-static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
+static __global__ void fill_kernel(T * dst, const int64_t k, const T value) {
    const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -21,6 +21,7 @@
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
+#include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
@ -2651,6 +2652,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
                break;
+        case GGML_OP_DIAG:
+            ggml_cuda_op_diag(ctx, dst);
+            break;
        case GGML_OP_DIAG_MASK_INF:
            ggml_cuda_op_diag_mask_inf(ctx, dst);
            break;
@ -4638,6 +4642,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_FILL:
        case GGML_OP_CUMSUM:
        case GGML_OP_TRI:
+        case GGML_OP_DIAG:
            return true;
        case GGML_OP_SOLVE_TRI:
            return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@ -411,6 +411,38 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_me
    return res;
 }

+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched(ggml_metal_library_t lib, const ggml_tensor * op, int ssm_conv_bs) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
+    GGML_ASSERT(ggml_is_contiguous(op->src[1]));
+
+    char base[256];
+    char name[256];
+
+    const char * suffix = "";
+    if (op->src[1]->ne[0] % 4 == 0) {
+        suffix = "_4";
+    }
+
+    snprintf(base, 256, "kernel_ssm_conv_%s_%s_batched%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type), suffix);
+    snprintf(name, 256, "%s_ssm_conv_bs=%d", base, ssm_conv_bs);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+        ggml_metal_cv_set_int16(cv, ssm_conv_bs, FC_SSM_CONV + 0);
+
+        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+        ggml_metal_cv_free(cv);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_library_t lib, const ggml_tensor * op)  {
    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);

@ -427,7 +459,12 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_me
        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
    }

-    res.smem = 32*sizeof(float)*nsg;
+    // Shared memory layout:
+    // - sgptg * NW floats for partial sums (nsg * 32)
+    // - sgptg floats for shared_x_dt (nsg)
+    // - sgptg floats for shared_dA (nsg)
+    // Total: nsg * (32 + 2) floats
+    res.smem = (32 + 2)*sizeof(float)*nsg;

    return res;
 }
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@ -117,6 +117,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_ad
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max          (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched  (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan          (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@ -77,6 +77,7 @@
 #define FC_MUL_MV                      600
 #define FC_MUL_MM                      700
 #define FC_ROPE                        800
+#define FC_SSM_CONV                    900

 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPTG 8
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@ -1365,15 +1365,43 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
        /*.nb2  =*/ nb2,
    };

-    auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+    // Use batched kernel for prefill (ne1 > 1) to reduce threadgroup dispatch overhead
+    const bool use_batched = (ne1 > 1);

-    ggml_metal_encoder_set_pipeline(enc, pipeline);
-    ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
-    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);
+    if (use_batched) {
+        // Determine the smallest power of 2 that's >= ne1, but <= 256
+        int BATCH_SIZE;
+        if      (ne1 > 128) BATCH_SIZE = 256;
+        else if (ne1 > 64 ) BATCH_SIZE = 128;
+        else if (ne1 > 32 ) BATCH_SIZE = 64;
+        else if (ne1 > 16 ) BATCH_SIZE = 32;
+        else if (ne1 > 8  ) BATCH_SIZE = 16;
+        else if (ne1 > 4  ) BATCH_SIZE = 8;
+        else                BATCH_SIZE = 2;

-    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+        auto pipeline = ggml_metal_library_get_pipeline_ssm_conv_batched(lib, op, BATCH_SIZE);
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);
+
+        // Dispatch: ne01 rows, ceil(ne1/BATCH_SIZE) token batches, ne02 sequences
+        // Each threadgroup has BATCH_SIZE threads, each handling one token
+        const int n_token_batches = (ne1 + BATCH_SIZE - 1) / BATCH_SIZE;
+        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, n_token_batches, ne02, BATCH_SIZE, 1, 1);
+    } else {
+        auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);
+
+        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+    }

    return 1;
 }
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@ -2343,7 +2343,102 @@ kernel void kernel_ssm_conv_f32_f32_4(
    x[0] = sumf;
 }

+constant short FC_ssm_conv_bs   [[function_constant(FC_SSM_CONV + 0)]];
+
+// Batched version: each threadgroup processes multiple tokens for better efficiency
+// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens
+kernel void kernel_ssm_conv_f32_f32_batched(
+        constant ggml_metal_kargs_ssm_conv & args,
+        device const  void * src0,
+        device const  void * src1,
+        device       float * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    // tgpig.x = row index (ir)
+    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
+    // tgpig.z = sequence index (i3)
+    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
+    const short BATCH_SIZE = FC_ssm_conv_bs;
+
+    const int64_t ir      = tgpig.x;
+    const int64_t i2_base = tgpig.y * BATCH_SIZE;
+    const int64_t i3      = tgpig.z;
+    const int64_t i2_off  = tpitg.x;
+    const int64_t i2      = i2_base + i2_off;
+
+    const int64_t nc  = args.ne10;  // conv kernel size (typically 4)
+    const int64_t n_t = args.ne1;   // number of tokens
+
+    // Bounds check for partial batches at the end
+    if (i2 >= n_t) {
+        return;
+    }
+
+    // Load conv weights (shared across all tokens for this row)
+    device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
+
+    // Load source for this specific token
+    device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
+
+    // Output location for this token
+    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
+
+    float sumf = 0.0f;
+    for (int64_t i0 = 0; i0 < nc; ++i0) {
+        sumf += s[i0] * c[i0];
+    }
+
+    x[0] = sumf;
+}
+
+kernel void kernel_ssm_conv_f32_f32_batched_4(
+        constant ggml_metal_kargs_ssm_conv & args,
+        device const  void * src0,
+        device const  void * src1,
+        device       float * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    // tgpig.x = row index (ir)
+    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
+    // tgpig.z = sequence index (i3)
+    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
+    const short BATCH_SIZE = FC_ssm_conv_bs;
+
+    const int64_t ir      = tgpig.x;
+    const int64_t i2_base = tgpig.y * BATCH_SIZE;
+    const int64_t i3      = tgpig.z;
+    const int64_t i2_off  = tpitg.x;
+    const int64_t i2      = i2_base + i2_off;
+
+    const int64_t nc  = args.ne10;  // conv kernel size (typically 4)
+    const int64_t n_t = args.ne1;   // number of tokens
+
+    // Bounds check for partial batches at the end
+    if (i2 >= n_t) {
+        return;
+    }
+
+    // Load conv weights (shared across all tokens for this row)
+    device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
+
+    // Load source for this specific token
+    device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
+
+    // Output location for this token
+    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
+
+    float sumf = 0.0f;
+    for (int64_t i0 = 0; i0 < nc/4; ++i0) {
+        sumf += dot(s[i0], c[i0]);
+    }
+
+    x[0] = sumf;
+}
+
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
+// Optimized version: reduces redundant memory loads by having one thread load shared values
 kernel void kernel_ssm_scan_f32(
        constant ggml_metal_kargs_ssm_scan & args,
        device const void * src0,
@ -2363,7 +2458,15 @@ kernel void kernel_ssm_scan_f32(
        uint3    tgpg[[threadgroups_per_grid]]) {
    constexpr short NW = N_SIMDWIDTH;

-    shared[tpitg.x] = 0.0f;
+    // Shared memory layout:
+    // [0..sgptg*NW-1]: partial sums for reduction (existing)
+    // [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch
+    // [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch
+    threadgroup float * shared_sums = shared;
+    threadgroup float * shared_x_dt = shared + sgptg * NW;
+    threadgroup float * shared_dA   = shared + sgptg * NW + sgptg;
+
+    shared_sums[tpitg.x] = 0.0f;

    const int32_t i0 = tpitg.x;
    const int32_t i1 = tgpig.x;
@ -2403,32 +2506,47 @@ kernel void kernel_ssm_scan_f32(
    for (int i2 = 0; i2 < n_t; i2 += sgptg) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

-        for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
-            const float dt0  = dt[0];
+        // Pre-compute x_dt and dA for this batch of tokens
+        // Only first sgptg threads do the loads and expensive math
+        if (i0 < sgptg && i2 + i0 < n_t) {
+            // ns12 and ns21 are element strides (nb12/nb10, nb21/nb20)
+            device const float * x_t  = x  + i0 * args.ns12;
+            device const float * dt_t = dt + i0 * args.ns21;
+
+            const float dt0  = dt_t[0];
            const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
-            const float x_dt = x[0] * dtsp;
-            const float dA   = exp(dtsp * A0);
+            shared_x_dt[i0] = x_t[0] * dtsp;
+            shared_dA[i0]   = dtsp;  // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
+            const float x_dt = shared_x_dt[t];
+            const float dA   = exp(shared_dA[t] * A0);

            s = (s0 * dA) + (B[i0] * x_dt);

            const float sumf = simd_sum(s * C[i0]);

            if (tiisg == 0) {
-                shared[t*NW + sgitg] = sumf;
+                shared_sums[t*NW + sgitg] = sumf;
            }

            // recurse
            s0 = s;

-            x  += args.ns12;
-            dt += args.ns21;
            B  += args.ns42;
            C  += args.ns52;
        }

+        // Advance pointers for next batch
+        x  += sgptg * args.ns12;
+        dt += sgptg * args.ns21;
+
        threadgroup_barrier(mem_flags::mem_threadgroup);

-        const float sumf = simd_sum(shared[sgitg*NW + tiisg]);
+        const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]);

        if (tiisg == 0 && i2 + sgitg < n_t) {
            y[sgitg*nh*nr] = sumf;
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@ -277,7 +277,7 @@ static void soft_max_f32_sycl(const float *x, const T *mask,
    const int id       = get_current_device_id();
    const size_t smpbo = ggml_sycl_info().devices[id].smpbo;

-    if (nbytes_shared <= smpbo) {
+    if (nbytes_shared <= smpbo && ncols_x <= max_block_size) {
        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(
            x, mask, sinks, dst, params, stream, block_dims, block_nums,
            nbytes_shared);
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -124,6 +124,13 @@ static void ggml_print_backtrace_symbols(void) {
    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
 }
+#elif defined(__APPLE__)
+#include <execinfo.h>
+static void ggml_print_backtrace_symbols(void) {
+    void * trace[100];
+    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
+}
 #else
 static void ggml_print_backtrace_symbols(void) {
    // platform not supported
@ -135,6 +142,20 @@ void ggml_print_backtrace(void) {
    if (GGML_NO_BACKTRACE) {
        return;
    }
+#if defined(__APPLE__)
+    // On macOS, fork+debugger attachment is problematic due to:
+    // 1. libdispatch "poisons" forked child processes
+    // 2. lldb has issues attaching to parent from forked child
+    // Use simple backtrace() instead to avoid Terminal.app crashes
+    const char * GGML_BACKTRACE_LLDB = getenv("GGML_BACKTRACE_LLDB");
+    if (!GGML_BACKTRACE_LLDB) {
+        fprintf(stderr, "WARNING: Using native backtrace. Set GGML_BACKTRACE_LLDB for more info.\n");
+        fprintf(stderr, "WARNING: GGML_BACKTRACE_LLDB may cause native MacOS Terminal.app to crash.\n");
+        fprintf(stderr, "See: https://github.com/ggml-org/llama.cpp/pull/17869\n");
+        ggml_print_backtrace_symbols();
+        return;
+    }
+#endif
 #if defined(__linux__)
    FILE * f = fopen("/proc/self/status", "r");
    size_t size = 0;
--- a/grammars/README.md
+++ b/grammars/README.md
@ -67,6 +67,30 @@ Parentheses `()` can be used to group sequences, which allows for embedding alte
 - `{m,n}` repeats the precedent symbol or sequence at between `m` and `n` times (included)
 - `{0,n}` repeats the precedent symbol or sequence at most `n` times (included)

+## Tokens
+
+Tokens allow grammars to match specific tokenizer tokens rather than character sequences. This is useful for constraining outputs based on special tokens (like `<think>` or `</think>`).
+
+Tokens can be specified in two ways:
+
+1. **Token ID**: Use angle brackets with the token ID in square brackets: `<[token-id]>`. For example, `<[1000]>` matches the token with ID 1000.
+
+2. **Token string**: Use angle brackets with the token text directly: `<token>`. For example, `<think>` will match the token whose text is exactly `<think>`. This only works if the string tokenizes to exactly one token in the vocabulary, otherwise the grammar will fail to parse.
+
+You can negate token matches using the `!` prefix: `!<[1000]>` or `!<think>` matches any token *except* the specified one.
+
+```
+# Match a thinking block: <think>...</think>
+# Using token strings (requires these to be single tokens in the vocab)
+root ::= <think> thinking </think> .*
+thinking ::= !</think>*
+
+# Equivalent grammar using explicit token IDs
+# Assumes token 1000 = <think>, token 1001 = </think>
+root ::= <[1000]> thinking <[1001]> .*
+thinking ::= !<[1001]>*
+```
+
 ## Comments and newlines

 Comments can be specified with `#`:
--- a/models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja
+++ b/models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja
@ -0,0 +1,126 @@
+{#- Default system message if no system prompt is passed. #}
+{%- set default_system_message = '# HOW YOU SHOULD THINK AND ANSWER\n\nFirst draft your thinking process (inner monologue) until you arrive at a response. Format your response using Markdown, and use LaTeX for any mathematical equations. Write both your thoughts and the response in the same language as the input.\n\nYour thinking process must follow the template below:[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate the response to the user.[/THINK]Here, provide a self-contained response.' %}
+
+{#- Begin of sequence token. #}
+{{- bos_token }}
+
+{#- Handle system prompt if it exists. #}
+{#- System prompt supports text content or text and thinking chunks. #}
+{%- if messages[0]['role'] == 'system' %}
+    {{- '[SYSTEM_PROMPT]' -}}
+    {%- if messages[0]['content'] is string %}
+        {{- messages[0]['content'] -}}
+    {%- else %}        
+        {%- for block in messages[0]['content'] %}
+            {%- if block['type'] == 'text' %}
+                {{- block['text'] }}
+            {%- elif block['type'] == 'thinking' %}
+                {{- '[THINK]' + block['thinking'] + '[/THINK]' }}
+            {%- else %}
+                {{- raise_exception('Only text and thinking chunks are supported in system message contents.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- endif %}
+    {{- '[/SYSTEM_PROMPT]' -}}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set loop_messages = messages %}
+    {%- if default_system_message != '' %}
+        {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }}
+    {%- endif %}
+{%- endif %}
+
+
+{#- Tools definition #}
+{%- set tools_definition = '' %}
+{%- set has_tools = false %}
+{%- if tools is defined and tools is not none and tools|length > 0 %}
+    {%- set has_tools = true %}
+    {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %}
+    {{- tools_definition }}
+{%- endif %}
+
+{#- Checks for alternating user/assistant messages. #}
+{%- set ns = namespace(index=0) %}
+{%- for message in loop_messages %}
+    {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %}
+        {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %}
+            {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }}
+        {%- endif %}
+        {%- set ns.index = ns.index + 1 %}
+    {%- endif %}
+{%- endfor %}
+
+{#- Handle conversation messages. #}
+{%- for message in loop_messages %}
+
+    {#- User messages supports text content or text and image chunks. #}
+    {%- if message['role'] == 'user' %}
+        {%- if message['content'] is string %}
+            {{- '[INST]' + message['content'] + '[/INST]' }}
+        {%- elif message['content'] | length > 0 %}
+            {{- '[INST]' }}
+            {%- if message['content'] | length == 2 %}
+                {%- set blocks = message['content'] | sort(attribute='type') %}
+            {%- else %}
+                {%- set blocks = message['content'] %}
+            {%- endif %}
+            {%- for block in blocks %}
+                {%- if block['type'] == 'text' %}
+                    {{- block['text'] }}
+                {%- elif block['type'] in ['image', 'image_url'] %}
+                    {{- '[IMG]' }}
+                {%- else %}
+                    {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }}
+                {%- endif %}
+            {%- endfor %}
+            {{- '[/INST]' }}
+        {%- else %}
+            {{- raise_exception('User message must have a string or a list of chunks in content') }}
+        {%- endif %}
+
+    {#- Assistant messages supports text content or text, image and thinking chunks. #}
+    {%- elif message['role'] == 'assistant' %}
+        {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}
+            {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }}
+        {%- endif %}
+
+        {%- if message['content'] is string and message['content'] != '' %}
+            {{- message['content'] }}
+        {%- elif message['content'] | length > 0 %}
+            {%- for block in message['content'] %}
+                {%- if block['type'] == 'text' %}
+                    {{- block['text'] }}
+                {%- elif block['type'] == 'thinking' %}
+                    {{- '[THINK]' + block['thinking'] + '[/THINK]' }}
+                {%- else %}
+                    {{- raise_exception('Only text and thinking chunks are supported in assistant message contents.') }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        
+        {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %}
+            {%- for tool in message['tool_calls'] %}
+                {{- '[TOOL_CALLS]' }}
+                {%- set name = tool['function']['name'] %}
+                {%- set arguments = tool['function']['arguments'] %}
+                {%- if arguments is not string %}
+                    {%- set arguments = arguments|tojson|safe %}
+                {%- elif arguments == '' %}
+                    {%- set arguments = '{}' %}
+                {%- endif %}
+                {{- name + '[ARGS]' + arguments }}
+            {%- endfor %}
+        {%- endif %}
+
+        {{- eos_token }}
+
+    {#- Tool messages only supports text content. #}
+    {%- elif message['role'] == 'tool' %}
+        {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }}
+
+    {#- Raise exception for unsupported roles. #}
+    {%- else %}
+        {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }}
+    {%- endif %}
+{%- endfor %}
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -67,7 +67,7 @@ add_library(llama
            models/gemma-embedding.cpp
            models/gemma.cpp
            models/gemma2-iswa.cpp
-            models/gemma3-iswa.cpp
+            models/gemma3.cpp
            models/gemma3n-iswa.cpp
            models/glm4-moe.cpp
            models/glm4.cpp
@ -139,6 +139,7 @@ add_library(llama
 set_target_properties(llama PROPERTIES
    VERSION ${LLAMA_INSTALL_VERSION}
    SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )

 target_include_directories(llama PRIVATE .)
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
    throw std::runtime_error("unexpected end of input");
 }

+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+    const char * pos = src;
+    if (*pos != '<') {
+        throw std::runtime_error(std::string("expecting '<' at ") + pos);
+    }
+    pos++;
+
+    // Parse <[id]>
+    if (*pos == '[') {
+        pos++;
+        const char * int_end = parse_int(pos);
+        uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+        pos = int_end;
+        if (*pos != ']') {
+            throw std::runtime_error(std::string("expecting ']' at ") + pos);
+        }
+        pos++;
+        if (*pos != '>') {
+            throw std::runtime_error(std::string("expecting '>' at ") + pos);
+        }
+        pos++;
+        return std::make_pair(token_id, pos);
+    }
+
+    if (vocab == nullptr) {
+        throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+    }
+
+    // Parse <token> and tokenize to obtain the token id
+    while (*pos != 0 && *pos != '>') {
+        pos++;
+    }
+    if (*pos != '>') {
+        throw std::runtime_error(std::string("expecting '>' at ") + pos);
+    }
+    pos++;
+
+    llama_token tokens[2];
+    int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+    if (n_tokens != 1) {
+        // must tokenize to exactly 1 token
+        throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+    }
+    return std::make_pair(tokens[0], pos);
+}
+
 static void print_grammar_char(FILE * file, uint32_t c) {
    if (0x20 <= c && c <= 0x7f) {
        fprintf(file, "%c", static_cast<char>(c));
@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
            case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
+            case LLAMA_GRETYPE_TOKEN:          fprintf(file, "TOKEN");          break;
+            case LLAMA_GRETYPE_TOKEN_NOT:      fprintf(file, "TOKEN_NOT");      break;
        }
        switch (elem.type) {
            case LLAMA_GRETYPE_END:
@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
                print_grammar_char(file, elem.value);
                fprintf(file, "\") ");
                break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
        }
    }
    fprintf(file, "\n");
@ -284,6 +343,17 @@ static void print_rule(
            case LLAMA_GRETYPE_CHAR_ANY:
                fprintf(file, ".");
                break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
        }
        if (is_char_element(elem)) {
            switch (rule[i + 1].type) {
@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
                }
            }
            pos = parse_space(pos + 1, is_nested);
+        } else if (*pos == '<' || *pos == '!') { // token
+            auto type = LLAMA_GRETYPE_TOKEN;
+            if (*pos == '!') { // token inverse
+                type = LLAMA_GRETYPE_TOKEN_NOT;
+                pos++;
+            }
+            auto token_pair = parse_token(vocab, pos);
+            const char * token_end  = token_pair.second;
+            last_sym_start = rule.size();
+            rule.push_back({type, token_pair.first});
+            pos = parse_space(token_end, is_nested);
        } else if (is_word_char(*pos)) { // rule reference
            const char * name_end    = parse_name(pos);
            uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
    return !is_positive_char;
 }

+// returns true iff token matches the rule at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+    const llama_grammar_element * pos,
+    const llama_token             token) {
+    GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+    if (pos->type == LLAMA_GRETYPE_TOKEN) {
+        return pos->value == static_cast<uint32_t>(token);
+    }
+    if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return pos->value != static_cast<uint32_t>(token);
+    }
+    return false;
+}
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
        case LLAMA_GRETYPE_CHAR:
        case LLAMA_GRETYPE_CHAR_NOT:
        case LLAMA_GRETYPE_CHAR_ANY:
+        case LLAMA_GRETYPE_TOKEN:
+        case LLAMA_GRETYPE_TOKEN_NOT:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                // only add the stack if it's not a duplicate of one we already have
                new_stacks.emplace_back(stack);
@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
    return grammar->stacks;
 }

+static void llama_grammar_accept_chr(
+        struct llama_grammar       & grammar,
+        const llama_grammar_stack  & stack,
+              uint32_t               chr,
+              llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    // ignore if this turns into a token
+    if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return;
+    }
+
+    auto match = llama_grammar_match_char(pos, chr);
+    if (match.first) {
+        llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+        if (!llama_grammar_is_end_of_sequence(match.second)) {
+            new_stack.push_back(match.second);
+        }
+        llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+    }
+}
+
 void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
    llama_grammar_stacks stacks_new;
    stacks_new.reserve(grammar->stacks.size());

    for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
-        }
+        llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
    }

    grammar->stacks = std::move(stacks_new);
@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

    const llama_grammar_element * stack_pos = stack.back();

+    // if the top of the stack is a token rule, then we only need to check the token id
+    if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial response
+                if (tok.partial_utf8.n_remain != 0) {
+                    rejects.push_back(tok);
+                }
+            } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
    llama_grammar_candidates next_candidates;
    next_candidates.reserve(candidates.size());

@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
                rejects.push_back(tok);
            }
        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
        } else {
            rejects.push_back(tok);
        }
@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
    for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
    }

    return rejects;
@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
        vocab,
        std::move(vec_rules),
        std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy =*/              false,
-        /* .awaiting_trigger = */ false,
-        /* .trigger_buffer = */   "",
-        /* .trigger_tokens   = */ {},
-        /* .trigger_patterns    = */ {},
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     false,
+        /* .awaiting_trigger = */         false,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
+        /* .trigger_tokens = */           {},
+        /* .trigger_patterns = */         {},
    };
 }

@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
                            size_t num_trigger_patterns,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens) {
-    llama_grammar_parser parser;
+    llama_grammar_parser parser(vocab);

    // if there is a grammar, parse it
    // rules will be empty (default) if there are parse errors
@ -1077,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
        vocab,
        std::move(vec_rules),
        std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             lazy,
-        /* .awaiting_trigger = */ lazy,
-        /* .trigger_buffer = */   "",
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     lazy,
+        /* .awaiting_trigger = */         lazy,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
        std::move(vec_trigger_tokens),
        std::move(vec_trigger_patterns),
    };
@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
        grammar.lazy,
        grammar.awaiting_trigger,
        grammar.trigger_buffer,
+        grammar.trigger_buffer_positions,
        grammar.trigger_tokens,
        grammar.trigger_patterns,
    };
@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
            cur_p->data[i].logit = -INFINITY;
        } else {
            candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
        }
    }

@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
        if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
            grammar.awaiting_trigger = false;
            grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, piece);
+            llama_grammar_accept_token(grammar, token, piece);
            LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
            return;
        } else {
+            auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
            grammar.trigger_buffer += piece;

            std::smatch match;
@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
                    if (start == std::string::npos) {
                        start = match.position(0);
                    }
+
+                    // replay tokens that overlap with [start, end)
+                    for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+                        auto [tok_start, tok_end] = tok_pos;
+                        if (tok_end <= start) {
+                            continue;
+                        }
+
+                        size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+                        size_t piece_len = tok_end - piece_start;
+                        auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+                        llama_grammar_accept_token(grammar, tok, tok_piece);
+                    }
+
                    auto constrained_str = grammar.trigger_buffer.substr(start);
-                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                    grammar.trigger_buffer.clear();
-                    llama_grammar_accept_str(grammar, constrained_str);
+                    grammar.trigger_buffer_positions.clear();
                    LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                    return;
                }
@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
        GGML_ABORT("fatal error");
    }

-    llama_grammar_accept_str(grammar, piece);
+    llama_grammar_accept_token(grammar, token, piece);
 }

 void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
    }
 }
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar.stacks.size());
+
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const llama_grammar_element * pos = stack.back();
+
+        if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+            if (llama_grammar_match_token(pos, token)) {
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    new_stack.push_back(pos + 1);
+                }
+                llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+            }
+        } else {
+            llama_grammar_stacks current_stacks = {stack};
+
+            for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+                llama_grammar_stacks next_stacks;
+
+                for (const auto & cur_stack : current_stacks) {
+                    llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+                }
+
+                current_stacks = std::move(next_stacks);
+                if (current_stacks.empty()) {
+                    break;
+                }
+            }
+
+            for (auto & surviving_stack : current_stacks) {
+                if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+                    stacks_new.emplace_back(surviving_stack);
+                }
+            }
+        }
+    }
+
+    grammar.stacks = std::move(stacks_new);
+    grammar.partial_utf8 = decoded.second;
+
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+    }
+}
+
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@ -36,11 +36,17 @@ enum llama_gretype {

    // any character (.)
    LLAMA_GRETYPE_CHAR_ANY       = 7,
+
+    // terminal element: token (<[token-id]>)
+    LLAMA_GRETYPE_TOKEN          = 8,
+
+    // inverse token (!<[token-id]>)
+    LLAMA_GRETYPE_TOKEN_NOT      = 9,
 };

 typedef struct llama_grammar_element {
    enum llama_gretype type;
-    uint32_t           value; // Unicode code point or rule ID
+    uint32_t           value; // Unicode code point, rule ID, or token ID
 } llama_grammar_element;

 struct llama_partial_utf8 {
@ -52,6 +58,7 @@ struct llama_grammar_candidate {
    size_t               index;
    const uint32_t     * code_points;
    llama_partial_utf8   partial_utf8;
+    llama_token          id;
 };

 using llama_grammar_rule  = std::vector<      llama_grammar_element>;
@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
        const llama_grammar_candidates & candidates);

 struct llama_grammar_parser {
+    const llama_vocab * vocab;
    std::map<std::string, uint32_t> symbol_ids;

    llama_grammar_rules rules;

+    llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
    llama_grammar_stack c_rules() const;

    uint32_t get_symbol_id(const char * src, size_t len);
@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
 };

 struct llama_grammar {
+    // maintain a list of llama_tokens and their positions in the trigger_buffer
+    using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
    // note: allow null vocab for testing (not great)
    const llama_vocab * vocab;

@ -127,6 +140,7 @@ struct llama_grammar {
    bool                     lazy             = false;
    bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
    std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<token_pos>   trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
    std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
    std::vector<llama_grammar_trigger_pattern>
                             trigger_patterns;         // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
 void llama_grammar_accept_str(
              struct llama_grammar & grammar,
                 const std::string & piece);
+
+void llama_grammar_accept_token(
+              struct llama_grammar & grammar,
+                       llama_token   token,
+                 const std::string & piece);
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -1034,7 +1034,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

        // mask out the other groups
        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
        selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
        cb(selection_probs, "ffn_moe_probs_masked", il);
    }
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -1264,18 +1264,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_GEMMA3:
            {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.set_swa_pattern(6);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(6);

-                hparams.rope_freq_base_train_swa  = 10000.0f;
-                hparams.rope_freq_scale_train_swa = 1.0f;
+                    hparams.rope_freq_base_train_swa  = 10000.0f;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }

-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                hparams.f_final_logit_softcapping = 0.0f;
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 18: type = LLM_TYPE_270M; break;
                    case 26: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                    case 34: type = LLM_TYPE_4B; break;
                    case 48: type = LLM_TYPE_12B; break;
                    case 62: type = LLM_TYPE_27B; break;
@ -1599,8 +1606,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);

-                switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_20B; break;
+                switch (hparams.n_ff_exp) {
+                    case 1408: type = LLM_TYPE_16B; break;
+                    case 1792: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@ -7304,7 +7312,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_GEMMA3:
            {
-                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+                }
            } break;
        case LLM_ARCH_GEMMA3N:
            {
--- a/src/models/gemma3-iswa.cpp
+++ b/src/models/gemma3-iswa.cpp
@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;

    ggml_tensor * cur;
@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
    ggml_tensor * inp_pos = build_inp_pos();

    // TODO: is causal == true correct? might need some changes
-    auto * inp_attn = build_attn_inp_kv_iswa();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
        cur = build_norm(cur,
                model.layers[il].ffn_post_norm, NULL,
                LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
+        cb(cur, "ffn_post_norm", il);

        cur = ggml_add(ctx0, cur, sa_out);

@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
    // lm_head
    cur = build_lora_mm(model.output, cur);

+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
--- a/src/models/models.h
+++ b/src/models/models.h
@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };

 struct llm_build_gemma3n_iswa : public llm_graph_context {
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -6253,6 +6253,31 @@ struct test_solve_tri : public test_case {
    }
 };

+// GGML_OP_DIAG
+struct test_diag : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
+
+    test_diag(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 1, 4, 3 })
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        GGML_ASSERT(ne[1] == 1);
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_diag(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+
 enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
@ -7829,6 +7854,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 }));
    test_cases.emplace_back(new test_fill(3.5f, GGML_TYPE_F32, { 2048, 512, 2, 2 }));

+    test_cases.emplace_back(new test_diag());
+    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 79, 1, 19, 13 }));
+    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 256, 1, 8, 16 }));
+
    test_cases.emplace_back(new test_solve_tri());
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 17, 17, 2, 4 }, { 9, 17, 2, 4 }));
@ -8181,6 +8210,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        }
    }

+    // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4,   3328, 1, 1}, {4, 3328, 1, 1})); // generate
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 512, 1)); // prefill
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 1,   1)); // generate
+
    return test_cases;
 }

--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -539,6 +539,71 @@ const common_chat_msg message_assist_call_python_lines           = simple_assist
 const common_chat_msg message_assist_call_python_lines_unclosed  = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')");
 const common_chat_msg message_assist_call_code_interpreter       = simple_assist_msg("", "", "code_interpreter", "{\"code\":\"print('hey')\"}");

+// Use for PEG parser implementations
+struct peg_test_case {
+    common_chat_templates_inputs params;
+    std::string input;
+    common_chat_msg expect;
+};
+
+struct make_peg_parser {
+    common_chat_params params_;
+    common_peg_arena arena_;
+
+    make_peg_parser(common_chat_templates * tmpls, const common_chat_templates_inputs & inputs) {
+        params_ = common_chat_templates_apply(tmpls, inputs);
+        arena_.load(params_.parser);
+    }
+
+    common_chat_msg parse(const std::string & msg, bool is_partial) {
+        return common_chat_peg_parse(arena_, msg, is_partial, /* syntax = */ {params_.format});
+    }
+};
+
+static void test_peg_parser(common_chat_templates * tmpls, const std::function<void(peg_test_case &)> & init) {
+    peg_test_case tc;
+    init(tc);
+    if (tc.params.messages.empty()) {
+        tc.params.messages = {message_user};
+    }
+    if (tc.expect.role.empty()) {
+        tc.expect.role = "assistant";
+    }
+
+    auto parser = make_peg_parser(tmpls, tc.params);
+
+    common_chat_msg msg_accum;
+    common_chat_msg msg_prev;
+    msg_accum.role = msg_prev.role = "assistant";
+
+    for (size_t i = 1; i <= tc.input.size(); ++i) {
+        auto is_partial = i < tc.input.size();
+        common_chat_msg msg_current = parser.parse(tc.input.substr(0, i), is_partial);
+
+        for (const auto & diff : common_chat_msg_diff::compute_diffs(msg_prev, msg_current)) {
+            if (!diff.reasoning_content_delta.empty()) {
+                msg_accum.reasoning_content += diff.reasoning_content_delta;
+            }
+            if (!diff.content_delta.empty()) {
+                msg_accum.content += diff.content_delta;
+            }
+            if (diff.tool_call_index != std::string::npos) {
+                if (!diff.tool_call_delta.name.empty()) {
+                    msg_accum.tool_calls.push_back({diff.tool_call_delta.name, "", ""});
+                }
+                if (!diff.tool_call_delta.arguments.empty()) {
+                    msg_accum.tool_calls.back().arguments += diff.tool_call_delta.arguments;
+                }
+            }
+        }
+        assert_msg_equals(msg_current, msg_accum, true);
+        msg_prev = msg_current;
+    }
+
+    assert_msg_equals(tc.expect, parser.parse(tc.input, false), true);
+    assert_msg_equals(tc.expect, msg_accum, true);
+}
+
 static void test_msgs_oaicompat_json_conversion() {
    printf("[%s]\n", __func__);
    std::vector<common_chat_msg> msgs{
@ -3434,7 +3499,95 @@ Hey there!<|im_end|>
        auto grammar = build_grammar(params.grammar);
        GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types");
    }
+}

+static void test_template_output_peg_parsers() {
+    printf("[%s]\n", __func__);
+
+    // JSON schemas
+    const char * invoice_schema = R"({
+        "type": "object",
+        "properties": {
+            "amount": {"type": "number"},
+            "date": {"type": "string"}
+        }
+    })";
+
+    {
+        // Ministral-3-14B-Reasoning-2512
+        auto tmpls = read_templates("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja");
+
+        // Test basic message
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = "Hello, world!\nWhat's up?";
+            t.expect = message_assist;
+        });
+
+        // Test basic message and reasoning with reasoning_format = none
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
+            t.expect.content = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
+        });
+
+        // Test basic message and reasoning with reasoning_format = auto
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
+            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+
+            t.expect = message_assist_thoughts;
+        });
+
+        // Test tool call
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})";
+            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            t.params.tools = {special_function_tool};
+
+            t.expect = message_assist_call;
+        });
+
+        // Test tool call with reasoning
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = "[THINK]I'm\nthinking[/THINK]"
+                      R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})";
+            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            t.params.tools = {special_function_tool};
+
+            t.expect = message_assist_call_thoughts;
+        });
+
+        // Test parallel tool calls
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})"
+                      R"([TOOL_CALLS]special_function_with_opt[ARGS]{"arg1": 1, "arg2": 2})";
+            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            t.params.parallel_tool_calls = true;
+            t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
+
+            t.expect.tool_calls = {{
+                /* .name = */      "special_function",
+                /* .arguments = */ R"({"arg1": 1})",
+                /* .id = */        {},
+            }, {
+                /* .name = */      "special_function_with_opt",
+                /* .arguments = */ R"({"arg1": 1, "arg2": 2})",
+                /* .id = */        {},
+            }};
+        });
+
+        // Test response format
+        test_peg_parser(tmpls.get(), [&](auto & t) {
+            t.input = "[THINK]I need to output the invoice details in JSON[/THINK]"
+                      "```json\n"
+                      R"({"amount": 123.45, "date": "2025-12-03"})"
+                      "\n```";
+            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            t.params.json_schema = invoice_schema;
+
+            t.expect.reasoning_content = "I need to output the invoice details in JSON";
+            t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
+        });
+    }
 }

 static void test_msg_diffs_compute() {
@ -3560,6 +3713,7 @@ int main(int argc, char ** argv) {
            test_msgs_oaicompat_json_conversion();
            test_tools_oaicompat_json_conversion();
            test_template_output_parsers();
+            test_template_output_peg_parsers();
            std::cout << "\n[chat] All tests passed!" << '\n';
        }
        return 0;
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@ -32,13 +32,66 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
    return grammar_fails;
 }

+struct token_and_piece {
+    llama_token token;
+    std::string piece;
+};
+
+// token() encodes a 32-bit ID as 5 bytes: a 0xff marker followed by the ID in big-endian order.
+static std::string token(llama_token id) {
+    return std::string{
+        static_cast<char>(0xff),
+        static_cast<char>((id >> 24) & 0xff),
+        static_cast<char>((id >> 16) & 0xff),
+        static_cast<char>((id >> 8) & 0xff),
+        static_cast<char>(id & 0xff)
+    };
+}
+
+// parse_tokens() parses the token encodes above and UTF-8 text.
+static std::vector<token_and_piece> parse_tokens(const std::string & input) {
+    std::vector<token_and_piece> result;
+    result.reserve(input.size());
+    size_t offset = 0;
+    while (offset < input.size()) {
+        try {
+            if (static_cast<unsigned char>(input[offset]) == 0xff) {
+                if (offset + 5 > input.size()) {
+                    throw std::runtime_error("not enough bytes for token id");
+                }
+                uint32_t val =
+                    (static_cast<unsigned char>(input[offset + 1]) << 24) |
+                    (static_cast<unsigned char>(input[offset + 2]) << 16) |
+                    (static_cast<unsigned char>(input[offset + 3]) << 8)  |
+                    (static_cast<unsigned char>(input[offset + 4]));
+                auto piece = "<[" + std::to_string(val) + "]>";
+                result.push_back({static_cast<llama_token>(val), piece});
+                offset += 5;
+            } else {
+                uint32_t cpt = unicode_cpt_from_utf8(input, offset);
+                result.push_back({0, unicode_cpt_to_utf8(cpt)});
+            }
+        } catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.push_back({0, unicode_cpt_to_utf8(0xFFFD)}); // replacement character
+        }
+    }
+    return result;
+}
+
 static bool match_string(const std::string & input, llama_grammar * grammar) {
-    const auto cpts = unicode_cpts_from_utf8(input);
+    const auto parsed = parse_tokens(input);

    auto & stacks_cur = llama_grammar_get_stacks(grammar);

-    for (const auto & cpt : cpts) {
-        llama_grammar_accept(grammar, cpt);
+    for (const auto & in : parsed) {
+        try {
+            llama_grammar_accept_token(*grammar, in.token, in.piece);
+        } catch (const std::runtime_error & /*e*/) {
+            // normally this shouldn't get hit because of llama_grammar_apply
+            return false;
+        }

        if (stacks_cur.empty()) {
            // no stacks means that the grammar failed to match at this point
@ -426,6 +479,30 @@ static void test_simple_grammar() {
            "12a45",
        }
    );
+
+    // Test case for a simple grammar with tokens
+    test_grammar(
+        "simple grammar with tokens",
+        R"""(
+            root ::= <[10]> content <[11]>
+            content ::= (!<[11]>)*)""",
+        // Passing strings
+        {
+            token(10) + "hello world" + token(11),
+            token(10) + "text with " + token(12) + " other tokens " + token(13) + " mixed in" + token(11),
+            token(10) + token(11),
+            token(10) + token(12) + token(13) + token(14) + token(15) + token(11),
+            token(10) + "a" + token(11),
+        },
+        // Failing strings
+        {
+            token(10) + "missing end token",
+            token(10),
+            "missing start token" + token(11),
+            token(10) + token(11) + token(11),  // double end token
+            token(11) + "wrong order" + token(10),
+        }
+    );
 }

 static void test_complex_grammar() {
@ -487,6 +564,34 @@ static void test_complex_grammar() {
            "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
        }
    );
+
+    // Test case for a more complex grammar with tokens
+    test_grammar(
+        "complex grammar with tokens",
+        R"""(
+            root ::= reasoning+ content tool-call*
+            reasoning ::= <[10]> (!<[11]>)* <[11]>
+            content ::= <[20]> (!<[21]>)* <[21]>
+            tool-call ::= <[12]> name <[13]> args <[14]>
+            name ::= (!<[13]>)+
+            args ::= (!<[14]>)*)""",
+        // Passing strings
+        {
+            token(10) + "I am thinking" + token(11) + token(20) + "hello world!" + token(21) + token(12) + "search" + token(13) + "query=test" + token(14),
+            token(10) + "reasoning 1" + token(11) + token(10) + "reasoning 2" + token(11) + token(20) + token(21) + token(12) + "tool" + token(13) + token(14),
+            token(10) + token(11) + token(20) + "content" + token(21),
+            token(10) + "think" + token(12) + " nested" + token(11) + token(20) + token(10) + "more content" + token(21) + token(12) + "fn" + token(13) + "x=1,y=2" + token(14) + token(12) + "fn2" + token(13) + token(14),
+            token(10) + "reasoning" + token(11) + token(10) + "more" + token(11) + token(10) + "even more" + token(11) + token(20) + "text" + token(21) + token(12) + "a" + token(13) + "b" + token(14) + token(12) + "c" + token(13) + "d" + token(14),
+        },
+        // Failing strings
+        {
+            token(20) + "content only" + token(21),
+            token(10) + "no closing reasoning",
+            token(10) + token(11) + token(20) + "no closing content",
+            token(10) + token(11) + token(20) + token(21) + token(12) + "incomplete tool",
+            token(10) + token(11) + token(11) + token(20) + token(21),
+        }
+    );
 }

 static void test_special_chars() {
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@ -515,5 +515,19 @@ int main()
        {LLAMA_GRETYPE_END, 0},
    });

+    // <[1000]> = "<think>"
+    // <[1001]> = "</think>"
+    verify_parsing(R"""(
+        root  ::= <[1000]> !<[1001]> <[1001]>
+    )""", {
+        {"root", 0}
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_TOKEN, 1000},
+        {LLAMA_GRETYPE_TOKEN_NOT, 1001},
+        {LLAMA_GRETYPE_TOKEN, 1001},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
    return 0;
 }
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@ -202,7 +202,7 @@ int main()
        uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
        cp[0] = 37 + i;
        cp[1] = 0;
-        next_candidates[i] = {i, cp, {}};
+        next_candidates[i] = {i, cp, {}, 0};
    }

    std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@ -16,6 +16,7 @@ add_library(mtmd
 set_target_properties(mtmd PROPERTIES
    VERSION ${LLAMA_INSTALL_VERSION}
    SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )

 target_link_libraries     (mtmd PUBLIC ggml llama)
--- a/tools/server/README-dev.md
+++ b/tools/server/README-dev.md
@ -42,7 +42,15 @@ graph TD
    server_response --> server_routes
 ```

-TODO: mention about how batching is handled by `server_slot`
+### Batching
+
+The server context maintains a single batch shared across all slots. When `update_slots()` is invoked, the system iterates through all active slots to populate this batch. For each slot, either a generated token from the previous decoding step or available prompt tokens are added to the batch.
+
+Batching constraints apply: slots can only be batched together if they share compatible configurations. For instance, slots using a specific LoRA adapter can be batched with each other, but not with slots using a different LoRA adapter or no adapter at all.
+
+Once the batch reaches capacity or all slots have been processed, `llama_decode` is called to execute the inference. This operation represents the primary computational bottleneck in `update_slots()`.
+
+Following decoding, the system either retrieves embeddings or samples the next token using `common_sampler_sample`. If a slot has remaining prompt tokens to process, it yields until the next `update_slots()` iteration.

 ### Thread Management

@ -62,6 +70,23 @@ Each incoming HTTP request is handled by its own thread managed by the HTTP libr
 - All JSON formatting and chat template logic must stay in the HTTP layer.
 - Avoid passing raw JSON between the HTTP layer and `server_slot`. Instead, parse everything into native C++ types as early as possible.

+### Example trace of a request
+
+Here is an example trace of an API request for text completion:
+
+- A request arrives at the HTTP layer.
+- The request is routed to the corresponding handler inside `server_routes`. In this case, `handle_completions_impl` is invoked.
+- The handler parses the input request, constructs a new `server_task`, and passes it to `server_res_generator`.
+- `server_res_generator` creates a new `task_result_state` for each task:
+    - `task_result_state` stays in the HTTP layer, responsible for keeping track of the current state of the response (e.g., parsing tool calls or thinking messages).
+    - `server_task` is moved into `server_queue` inside `server_context`.
+- `server_context` launches the task by moving it into an available slot (see `launch_slot_with_task()`).
+- `update_slot()` processes the task as described in the "Batching" section above.
+- Results may be sent using `send_partial_response` or `send_final_response`, which creates a new `server_task_result` and pushes it to the response queue.
+- At the same time, `server_res_generator` listens to the response queue and retrieves this response.
+- As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state.
+- `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer.
+
 ### Testing

 `llama-server` includes an automated test suite based on `pytest`.
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -972,6 +972,9 @@ json oaicompat_chat_params_parse(
    inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false);
    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
    inputs.reasoning_format      = opt.reasoning_format;
+    if (body.contains("reasoning_format")) {
+        inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
+    }
    inputs.enable_thinking       = opt.enable_thinking;
    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
        if (body.contains("grammar")) {
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -2601,6 +2601,10 @@ struct server_context_impl {
    int get_slot_n_ctx() {
        return slots.back().n_ctx;
    }
+
+    server_response_reader get_response_reader() {
+        return server_response_reader(queue_tasks, queue_results, HTTP_POLLING_SECONDS);
+    }
 };

 //
@ -2630,8 +2634,8 @@ llama_context * server_context::get_llama_context() const {
    return impl->ctx;
 }

-std::pair<server_queue &, server_response &> server_context::get_queues() {
-    return { impl->queue_tasks, impl->queue_results };
+server_response_reader server_context::get_response_reader() {
+    return impl->get_response_reader();
 }


@ -2640,7 +2644,7 @@ std::pair<server_queue &, server_response &> server_context::get_queues() {
 struct server_res_generator : server_http_res {
    server_response_reader rd;
    server_res_generator(server_context_impl & ctx_server)
-        : rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS) {}
+        : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {}
    void ok(const json & response_data) {
        status = 200;
        data = safe_json_to_str(response_data);
@ -2673,9 +2677,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
    try {
        std::vector<server_task> tasks;

-        // tracking generation state and partial tool calls
-        std::vector<task_result_state> states;
-
        const auto & prompt = data.at("prompt");
        // TODO: this log can become very long, put it behind a flag or think about a more compact format
        //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
@ -2691,7 +2692,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
        }
        tasks.reserve(inputs.size());
-        states.reserve(inputs.size());
        int idx = 0;
        for (size_t i = 0; i < inputs.size(); i++) {
            server_task task = server_task(type);
@ -2710,7 +2710,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
            task.params.res_type          = res_type;
            task.params.oaicompat_cmpl_id = completion_id;
            task.params.oaicompat_model   = ctx_server.model_name;
-            states.push_back(task.params.oaicompat_chat_syntax);

            if (task.params.n_cmpl > 1) {
                task.n_children = task.params.n_cmpl - 1;
@ -2719,7 +2718,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
                        task.id,
                        ctx_server.queue_tasks.get_new_id(),
                        idx++);
-                    states.push_back(child.params.oaicompat_chat_syntax);
                    tasks.push_back(std::move(child));
                }
            }
@ -2727,7 +2725,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
            tasks.push_back(std::move(task));
        }

-        rd.set_states(std::move(states));
        rd.post_tasks(std::move(tasks));
    } catch (const std::exception & e) {
        res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
@ -3457,7 +3454,7 @@ void server_routes::init_routes() {

        // create and queue the task
        json responses = json::array();
-        server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS);
+        server_response_reader rd = ctx_server.get_response_reader();
        {
            std::vector<server_task> tasks;
            tasks.reserve(documents.size());
@ -3717,7 +3714,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons

    // create and queue the task
    json responses = json::array();
-    server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS);
+    server_response_reader rd = ctx_server.get_response_reader();
    {
        std::vector<server_task> tasks;
        for (size_t i = 0; i < tokenized_prompts.size(); i++) {
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@ -31,9 +31,8 @@ struct server_context {
    // get the underlaying llama_context
    llama_context * get_llama_context() const;

-    // get the underlaying queue_tasks and queue_results
-    // used by CLI application
-    std::pair<server_queue &, server_response &> get_queues();
+    // get a new response reader, used by CLI application
+    server_response_reader get_response_reader();
 };


--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@ -271,12 +271,21 @@ void server_response::terminate() {
 // server_response_reader
 //

-void server_response_reader::set_states(std::vector<task_result_state> && states) {
-    this->states = std::move(states);
+void server_response_reader::post_task(server_task && task) {
+    GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
+    id_tasks.insert(task.id);
+    states.push_back(task.create_state());
+    queue_results.add_waiting_task_id(task.id);
+    queue_tasks.post(std::move(task));
 }

 void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
+    GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
    id_tasks = server_task::get_list_id(tasks);
+    states.reserve(tasks.size());
+    for (size_t i = 0; i < tasks.size(); i++) {
+        states.push_back(tasks[i].create_state());
+    }
    queue_results.add_waiting_tasks(tasks);
    queue_tasks.post(std::move(tasks));
 }
--- a/tools/server/server-queue.h
+++ b/tools/server/server-queue.h
@ -129,13 +129,13 @@ struct server_response_reader {
    std::vector<task_result_state> states;

    // should_stop function will be called each polling_interval_seconds
-    server_response_reader(std::pair<server_queue &, server_response &> server_queues, int polling_interval_seconds)
-        : queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {}
+    server_response_reader(server_queue & queue_tasks, server_response & queue_results, int polling_interval_seconds)
+        : queue_tasks(queue_tasks), queue_results(queue_results), polling_interval_seconds(polling_interval_seconds) {}
    ~server_response_reader() {
        stop();
    }

-    void set_states(std::vector<task_result_state> && states);
+    void post_task(server_task && tasks);
    void post_tasks(std::vector<server_task> && tasks);
    bool has_next() const;

--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@ -85,6 +85,25 @@ struct task_params {
    json to_json(bool only_metrics = false) const;
 };

+// struct for tracking the state of a task (e.g., for streaming)
+struct task_result_state {
+    // tracking diffs for partial tool calls
+    std::vector<common_chat_msg_diff> diffs;
+    common_chat_syntax oaicompat_chat_syntax;
+    common_chat_msg chat_msg;
+    std::string generated_text; // append new chunks of generated text here
+    std::vector<std::string> generated_tool_call_ids;
+
+    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
+        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}
+
+    // parse partial tool calls and update the internal state
+    common_chat_msg update_chat_msg(
+        const std::string & text_added,
+        bool is_partial,
+        std::vector<common_chat_msg_diff> & diffs);
+};
+
 struct server_task {
    int id    = -1; // to be filled by server_queue
    int index = -1; // used when there are multiple prompts (batch request)
@ -149,6 +168,12 @@ struct server_task {
        copy.tokens    = tokens.clone();
        return copy;
    }
+
+    // the task will be moved into queue, then onto slots
+    // however, the state must be kept by caller (e.g., HTTP thread)
+    task_result_state create_state() const {
+        return task_result_state(params.oaicompat_chat_syntax);
+    }
 };

 struct result_timings {
@ -180,25 +205,6 @@ struct result_prompt_progress {
    json to_json() const;
 };

-// struct for tracking the state of a task (e.g., for streaming)
-struct task_result_state {
-    // tracking diffs for partial tool calls
-    std::vector<common_chat_msg_diff> diffs;
-    common_chat_syntax oaicompat_chat_syntax;
-    common_chat_msg chat_msg;
-    std::string generated_text; // append new chunks of generated text here
-    std::vector<std::string> generated_tool_call_ids;
-
-    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
-        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}
-
-    // parse partial tool calls and update the internal state
-    common_chat_msg update_chat_msg(
-        const std::string & text_added,
-        bool is_partial,
-        std::vector<common_chat_msg_diff> & diffs);
-};
-
 struct server_task_result {
    int id           = -1;
    int id_slot      = -1;