Compare commits


No commits in common. "master" and "b7592" have entirely different histories.

105 changed files with 810 additions and 6502 deletions

View File

@@ -1098,7 +1098,6 @@ jobs:
 save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
 - name: Build with CMake
-# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
 run: |
 cmake -S . -B build -G Ninja \
 -DLLAMA_CURL=OFF \
@@ -1108,8 +1107,7 @@ jobs:
 -DCMAKE_CUDA_ARCHITECTURES=89-real \
 -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
 -DGGML_NATIVE=OFF \
--DGGML_CUDA=ON \
--DGGML_CUDA_CUB_3DOT2=ON
+-DGGML_CUDA=ON
 cmake --build build
 windows-2022-cmake-cuda:
@@ -1145,7 +1143,6 @@ jobs:
 - name: Build
 id: cmake_build
 shell: cmd
-# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
 run: |
 call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
 cmake -S . -B build -G "Ninja Multi-Config" ^
@@ -1156,8 +1153,7 @@ jobs:
 -DGGML_BACKEND_DL=ON ^
 -DGGML_CPU_ALL_VARIANTS=ON ^
 -DGGML_CUDA=ON ^
--DGGML_RPC=ON ^
--DGGML_CUDA_CUB_3DOT2=ON
+-DGGML_RPC=ON
 set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
 cmake --build build --config Release -j %NINJA_JOBS% -t ggml
 cmake --build build --config Release
@@ -1754,7 +1750,7 @@ jobs:
 sudo apt-get update
 # Install necessary packages
-sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
+sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
 # Set gcc-14 and g++-14 as the default compilers
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1766,8 +1762,6 @@ jobs:
 rustup install stable
 rustup default stable
-git lfs install
 - name: Clone
 id: checkout
 uses: actions/checkout@v4
@@ -1853,7 +1847,7 @@ jobs:
 sudo apt-get update
 # Install necessary packages
-sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
+sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
 # Set gcc-14 and g++-14 as the default compilers
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1865,8 +1859,6 @@ jobs:
 rustup install stable
 rustup default stable
-git lfs install
 - name: GCC version check
 run: |
 gcc --version
@@ -1947,7 +1939,7 @@ jobs:
 sudo apt-get update
 # Install necessary packages
-sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
+sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
 # Set gcc-14 and g++-14 as the default compilers
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -1959,8 +1951,6 @@ jobs:
 rustup install stable
 rustup default stable
-git lfs install
 - name: GCC version check
 run: |
 gcc --version
@@ -2021,7 +2011,7 @@ jobs:
 sudo apt-get update
 # Install necessary packages
-sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
+sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
 # Set gcc-14 and g++-14 as the default compilers
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
@@ -2033,8 +2023,6 @@ jobs:
 rustup install stable
 rustup default stable
-git lfs install
 - name: GCC version check
 run: |
 gcc --version

View File

@@ -420,7 +420,6 @@ jobs:
 - name: Build
 id: cmake_build
 shell: cmd
-# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
 run: |
 call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
 cmake -S . -B build -G "Ninja Multi-Config" ^
@@ -428,8 +427,7 @@ jobs:
 -DGGML_NATIVE=OFF ^
 -DGGML_CPU=OFF ^
 -DGGML_CUDA=ON ^
--DLLAMA_CURL=OFF ^
--DGGML_CUDA_CUB_3DOT2=ON
+-DLLAMA_CURL=OFF
 set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
 cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

View File

@@ -41,10 +41,6 @@ jobs:
 include:
 - build_type: Release
 sanitizer: ""
-extra_args: ""
-- build_type: Release
-sanitizer: ""
-extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
 fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 steps:
@@ -69,12 +65,6 @@ jobs:
 fetch-depth: 0
 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-- name: Build
-id: cmake_build
-run: |
-cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
-cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
 - name: Python setup
 id: setup_python
 uses: actions/setup-python@v5
@@ -86,14 +76,6 @@ jobs:
 run: |
 pip install -r tools/server/tests/requirements.txt
-- name: Tests
-id: server_integration_tests
-if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }}
-run: |
-cd tools/server/tests
-export ${{ matrix.extra_args }}
-pytest -v -x -m "not slow"
 server-windows:
 runs-on: windows-2022

View File

@@ -52,8 +52,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
-CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
+CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
 if command -v nvidia-smi >/dev/null 2>&1; then
 CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')

View File

@@ -1695,13 +1695,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
 }
 ).set_sparam());
-add_opt(common_arg(
-{"-bs", "--backend-sampling"},
-"enable backend sampling (experimental) (default: disabled)",
-[](common_params & params) {
-params.sampling.backend_sampling = true;
-}
-).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
 add_opt(common_arg(
 {"--pooling"}, "{none,mean,cls,last,rank}",
 "pooling type for embeddings, use model default if unspecified",

View File

@@ -1395,14 +1395,6 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
 builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
 }
-static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
-builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
-// TODO: Tool calling
-builder.add_content(builder.consume_rest());
-}
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
 builder.try_parse_reasoning("<think>", "</think>");
 builder.add_content(builder.consume_rest());
@@ -1487,9 +1479,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
 common_chat_parse_xiaomi_mimo(builder);
 break;
-case COMMON_CHAT_FORMAT_SOLAR_OPEN:
-common_chat_parse_solar_open(builder);
-break;
 default:
 throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
 }

View File

@@ -380,8 +380,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
 const auto & function = tool.at("function");
 result.push_back({
 /* .name = */ function.at("name"),
-/* .description = */ function.value("description", ""),
-/* .parameters = */ function.value("parameters", json::object()).dump(),
+/* .description = */ function.at("description"),
+/* .parameters = */ function.at("parameters").dump(),
 });
 }
 }
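A note on the `function.value` vs `function.at` change above: `nlohmann::json::value()` (master's side) falls back to a default when a key is missing, while `json::at()` (b7592's side) throws `json::out_of_range`. A minimal sketch of the difference, assuming nlohmann/json (the library this code uses) and a hypothetical tool payload:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    // Hypothetical OpenAI-style tool whose "function" omits "description" and "parameters".
    json function = { {"name", "get_time"} };

    // master's approach: value() substitutes a default for a missing key.
    std::string desc   = function.value("description", "");
    std::string params = function.value("parameters", json::object()).dump();
    std::cout << "lenient: desc='" << desc << "' params=" << params << "\n";

    // b7592's approach: at() throws json::out_of_range for the same payload.
    try {
        std::string strict = function.at("description").get<std::string>();
        std::cout << "strict desc: " << strict << "\n";
    } catch (const json::out_of_range & e) {
        std::cout << "at() threw: " << e.what() << "\n";
    }
}
```

In practice, master accepts tools that omit `description` or `parameters`, while b7592 rejects them with an exception.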
@@ -669,7 +669,6 @@ const char * common_chat_format_name(common_chat_format format) {
 case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
 case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
 case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
-case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
 case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
 case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
 case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2065,7 +2064,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
 // Trigger on tool calls that appear in the commentary channel
 data.grammar_triggers.push_back({
 COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-"<\\|channel\\|>(?:commentary|analysis) to"
+"<\\|channel\\|>(commentary|analysis) to"
 });
 // Trigger tool calls that appear in the role section, either at the
@@ -2398,17 +2397,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
 (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
 // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
 data.grammar_triggers.push_back({
-COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 // If thinking_forced_open, then we capture the </think> tag in the grammar,
 // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
-std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
+std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
 "\\s*("
 "(?:<tool_call>"
 "|<function"
 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
 "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
 ")"
-")"
+")[\\s\\S]*"
 ),
 });
 data.preserved_tokens = {
@@ -2518,27 +2517,6 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
 return data;
 }
-static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
-common_chat_params data;
-// TODO: Reasoning effort
-json additional_context = {};
-data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
-data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
-data.preserved_tokens = {
-"<|think|>",
-"<|content|>",
-"<|begin|>",
-"<|end|>",
-};
-// TODO: Tool calling
-return data;
-}
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
 data.prompt = apply(tmpl, inputs);
@@ -2802,13 +2780,6 @@ static common_chat_params common_chat_templates_apply_jinja(
 return common_chat_params_init_magistral(tmpl, params);
 }
-// Solar Open
-if (src.find("<|tool_response:begin|>") != std::string::npos &&
-src.find("<|tool_response:name|>") != std::string::npos &&
-src.find("<|tool_response:result|>") != std::string::npos) {
-return common_chat_params_init_solar_open(tmpl, params);
-}
 // Plain handler (no tools)
 if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
 return common_chat_params_init_without_tools(tmpl, params);

View File

@@ -124,7 +124,6 @@ enum common_chat_format {
 COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
 COMMON_CHAT_FORMAT_APRIEL_1_5,
 COMMON_CHAT_FORMAT_XIAOMI_MIMO,
-COMMON_CHAT_FORMAT_SOLAR_OPEN,
 // These are intended to be parsed by the PEG parser
 COMMON_CHAT_FORMAT_PEG_SIMPLE,

View File

@@ -1086,7 +1086,6 @@ struct common_init_result::impl {
 std::vector<llama_adapter_lora_ptr> lora;
 std::vector<common_sampler_ptr> samplers;
-std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 common_init_result::common_init_result(common_params & params) :
@@ -1163,19 +1162,10 @@ common_init_result::common_init_result(common_params & params) :
 // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
 //}
-// init the backend samplers as part of the context creation
 pimpl->samplers.resize(cparams.n_seq_max);
-pimpl->samplers_seq_config.resize(cparams.n_seq_max);
 for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
 pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
-}
-// TODO: temporarily gated behind a flag
-if (params.sampling.backend_sampling) {
-cparams.samplers = pimpl->samplers_seq_config.data();
-cparams.n_samplers = pimpl->samplers_seq_config.size();
 }
 llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1199,12 +1189,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
 return pimpl->samplers[seq_id].get();
 }
-void common_init_result::reset_samplers() {
-for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
-llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
-}
-}
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
 return pimpl->lora;
 }
@@ -1320,9 +1304,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
 llama_synchronize(lctx);
 llama_perf_context_reset(lctx);
 llama_set_warmup(lctx, false);
-// reset samplers to reset RNG state after warmup to the seeded state
-res->reset_samplers();
 }
 return res;

View File

@@ -216,8 +216,6 @@ struct common_params_sampling {
 std::vector<llama_logit_bias> logit_bias; // logit biases to apply
 std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
-bool backend_sampling = false;
 bool has_logit_bias() const {
 return !logit_bias.empty();
 }
@@ -691,9 +689,7 @@ struct common_init_result {
 llama_model * model();
 llama_context * context();
 common_sampler * sampler(llama_seq_id seq_id);
-void reset_samplers();
 std::vector<llama_adapter_lora_ptr> & lora();

View File

@@ -106,16 +106,12 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
 }
 static llama_sampler_i llama_sampler_llg_i = {
 /* .name = */ llama_sampler_llg_name,
 /* .accept = */ llama_sampler_llg_accept_impl,
 /* .apply = */ llama_sampler_llg_apply,
 /* .reset = */ llama_sampler_llg_reset,
 /* .clone = */ llama_sampler_llg_clone,
 /* .free = */ llama_sampler_llg_free,
-/* .backend_init = */ NULL,
-/* .backend_accept = */ NULL,
-/* .backend_apply = */ NULL,
-/* .backend_set_input = */ NULL,
 };
 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,

View File

@@ -27,7 +27,7 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
 return res;
 }
 std::match_results<std::string::const_reverse_iterator> srmatch;
-if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
+if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
 auto group = srmatch[1].str();
 if (group.length() != 0) {
 auto it = srmatch[1].second.base();
@@ -55,18 +55,18 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
 to see if a string ends with a partial regex match, but it's not in std::regex yet.
 Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.
-- /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
-- /a|b/ -> ^(a|b)
+- /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
+- /a|b/ -> (a|b).*
 - /a*?/ -> error, could match ""
-- /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
-- /.*?ab/ -> ^((?:b)?a) (omit .*)
-- /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
-- /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
-- /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
-- /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)
-The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern.
-All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
+- /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
+- /.*?ab/ -> ((?:b)?a).* (merge .*)
+- /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
+- /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
+- /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
+- /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
+The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern
+(i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
 */
 std::string regex_to_reversed_partial_regex(const std::string & pattern) {
 auto it = pattern.begin();
@@ -177,7 +177,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
 }
 }
-// /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
+// /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
 // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
 // We'll do the outermost capturing group and final .* in the enclosing function.
 std::vector<std::string> res_alts;
@@ -200,5 +200,5 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
 throw std::runtime_error("Unmatched '(' in pattern");
 }
-return "^(" + res + ")";
+return "(" + res + ")[\\s\\S]*";
 }
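To make the reversed-partial transformation concrete, here is a small illustrative sketch (my own, not part of the diff) that hardcodes the b7592-style output for /abcd/ and uses it on reversed input to locate a partial match at the end of a string:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Reversed-partial form of /abcd/, per the b7592 comments above:
    // (dcba|cba|ba|a) collapsed to ((?:(?:(?:d)?c)?b)?a), plus [\s\S]* to
    // consume whatever precedes the partial match in the reversed input.
    std::regex rx_reversed_partial("((?:(?:(?:d)?c)?b)?a)[\\s\\S]*");

    std::string input = "hello ab"; // ends with a partial match of /abcd/
    std::string reversed(input.rbegin(), input.rend()); // "ba olleh"

    std::smatch m;
    if (std::regex_match(reversed, m, rx_reversed_partial)) {
        // The end of group 1 in the reversed string marks where the partial
        // match starts in the original string.
        size_t start = input.size() - static_cast<size_t>(m.position(1) + m.length(1));
        std::cout << "partial match starts at " << start << "\n"; // prints 6
    }
}
```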

View File

@@ -120,34 +120,17 @@ struct common_sampler {
 }
 void set_logits(struct llama_context * ctx, int idx) {
-const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
-const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
-const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
+const auto * logits = llama_get_logits_ith(ctx, idx);
 const llama_model * model = llama_get_model(ctx);
 const llama_vocab * vocab = llama_model_get_vocab(model);
 const int n_vocab = llama_vocab_n_tokens(vocab);
-if (sampled_probs) {
-const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
-cur.resize(sampled_probs_count);
-for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
-}
-} else if (sampled_logits) {
-const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
-cur.resize(sampled_logits_count);
-for (uint32_t i = 0; i < sampled_logits_count; i++) {
-cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
-}
-} else {
-const auto * logits = llama_get_logits_ith(ctx, idx);
-GGML_ASSERT(logits != nullptr);
-cur.resize(n_vocab);
-for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-}
+cur.resize(n_vocab);
+for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
 }
 cur_p = { cur.data(), cur.size(), -1, false };
@@ -176,7 +159,7 @@ std::string common_params_sampling::print() const {
 return std::string(result);
 }
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
 const llama_vocab * vocab = llama_model_get_vocab(model);
 llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
@@ -196,30 +179,24 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 #endif // LLAMA_USE_LLGUIDANCE
 } else {
 std::vector<std::string> trigger_patterns;
+std::vector<std::string> patterns_anywhere;
 std::vector<llama_token> trigger_tokens;
 for (const auto & trigger : params.grammar_triggers) {
 switch (trigger.type) {
 case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
 {
 const auto & word = trigger.value;
-trigger_patterns.push_back(regex_escape(word));
+patterns_anywhere.push_back(regex_escape(word));
 break;
 }
 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
 {
-trigger_patterns.push_back(trigger.value);
+patterns_anywhere.push_back(trigger.value);
 break;
 }
 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
 {
-const auto & pattern = trigger.value;
-std::string anchored = "^$";
-if (!pattern.empty()) {
-anchored = (pattern.front() != '^' ? "^" : "")
-+ pattern
-+ (pattern.back() != '$' ? "$" : "");
-}
-trigger_patterns.push_back(anchored);
+trigger_patterns.push_back(trigger.value);
 break;
 }
 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +210,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 }
 }
+if (!patterns_anywhere.empty()) {
+trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+}
 std::vector<const char *> trigger_patterns_c;
 trigger_patterns_c.reserve(trigger_patterns.size());
 for (const auto & regex : trigger_patterns) {
@@ -315,12 +296,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 llama_sampler_chain_add(chain, smpl);
 }
-if (grmr && params.backend_sampling) {
-LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
-params.backend_sampling = false;
-}
 auto * result = new common_sampler {
 /* .params = */ params,
 /* .grmr = */ grmr,
@@ -430,25 +405,6 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 auto & chain = gsmpl->chain;
 auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-// Check if a backend sampler has already sampled a token in which case we
-// return that token id directly.
-{
-id = llama_get_sampled_token_ith(ctx, idx);
-if (id != LLAMA_TOKEN_NULL) {
-LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
-GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
-// TODO: simplify
-gsmpl->cur.resize(1);
-gsmpl->cur[0] = { id, 0.0f, 1.0f };
-cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
-return id;
-}
-}
 gsmpl->set_logits(ctx, idx);
 if (grammar_first) {
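On the trigger handling above: the b7592 side folds WORD and PATTERN triggers into a single full-match regex via `patterns_anywhere`, while master instead anchors each PATTERN_FULL value with `^`/`$`. Here is a self-contained sketch of the b7592-style combined "anywhere" pattern (hypothetical trigger strings, plain `std::regex` standing in for llama.cpp's grammar machinery):

```cpp
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Hypothetical trigger words; the real code regex-escapes WORD triggers first.
    std::vector<std::string> patterns_anywhere = {"<tool_call>", "<function"};

    // Fold all "anywhere" triggers into one full-match pattern: lazily skip
    // any prefix, capture the first trigger, swallow the rest.
    std::string combined = "^[\\s\\S]*?(";
    for (size_t i = 0; i < patterns_anywhere.size(); ++i) {
        combined += (i ? "|" : "") + patterns_anywhere[i];
    }
    combined += ")[\\s\\S]*";

    std::regex re(combined);
    std::string text = "some thinking... <tool_call>{\"name\":\"f\"}";
    std::smatch m;
    if (std::regex_match(text, m, re)) {
        // The capture marks where grammar-constrained decoding would kick in.
        std::cout << "trigger at offset " << m.position(1) << "\n"; // prints 17
    }
}
```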

View File

@@ -36,8 +36,7 @@ struct common_sampler;
 // llama_sampler API overloads
-// note: can mutate params in some cases
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 void common_sampler_free(struct common_sampler * gsmpl);
@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
-// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // extended sampling implementation:

View File

@@ -1062,9 +1062,6 @@ class TextModel(ModelBase):
 if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
 # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
 res = "grok-2"
-if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
-# ref: https://huggingface.co/aari1995/German_Semantic_V3
-res = "jina-v2-de"
 if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
 # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
 res = "llama-bpe"
@@ -1233,12 +1230,6 @@ class TextModel(ModelBase):
 if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
 # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
 res = "kormo"
-if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
-# ref: https://huggingface.co/tencent/Youtu-LLM-2B
-res = "youtu"
-if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
-# ref: https://huggingface.co/upstage/Solar-Open-100B
-res = "solar-open"
 if res is None:
 logger.warning("\n")
@@ -2495,7 +2486,6 @@ class StableLMModel(TextModel):
 "VLlama3ForCausalLM",
 "LlavaForConditionalGeneration",
 "VoxtralForConditionalGeneration",
-"IQuestCoderForCausalLM",
 "LlamaModel")
 class LlamaModel(TextModel):
 model_arch = gguf.MODEL_ARCH.LLAMA
@@ -3513,7 +3503,7 @@ class QwenModel(TextModel):
 self._set_vocab_qwen()
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
 class Qwen2Model(TextModel):
 model_arch = gguf.MODEL_ARCH.QWEN2
@@ -5294,14 +5284,13 @@ class BertModel(TextModel):
 self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 # convert to phantom space vocab
-def phantom(tok, toktype):
-if toktype == gguf.TokenType.CONTROL:
+def phantom(tok):
+if tok.startswith("[") and tok.endswith("]"):
 return tok
 if tok.startswith("##"):
 return tok[2:]
 return "\u2581" + tok
-assert len(tokens) == len(toktypes)
-tokens = list(map(phantom, tokens, toktypes))
+tokens = list(map(phantom, tokens))
 # add vocab to gguf
 self.gguf_writer.add_tokenizer_model("bert")
@@ -6415,17 +6404,6 @@ class ARwkv7Model(Rwkv7Model):
 self.gguf_writer.add_head_count(0)
-@ModelBase.register("MaincoderForCausalLM")
-class MaincoderModel(TextModel):
-model_arch = gguf.MODEL_ARCH.MAINCODER
-def set_gguf_parameters(self):
-super().set_gguf_parameters()
-if (head_dim := self.hparams.get("head_dim")) is not None:
-self.gguf_writer.add_rope_dimension_count(head_dim)
 @ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(TextModel):
 model_arch = gguf.MODEL_ARCH.MAMBA
@@ -7203,7 +7181,6 @@ class DeepseekModel(TextModel):
 "DeepseekV2ForCausalLM",
 "DeepseekV3ForCausalLM",
 "KimiVLForConditionalGeneration",
-"YoutuForCausalLM",
 )
 class DeepseekV2Model(TextModel):
 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7270,15 +7247,7 @@ class DeepseekV2Model(TextModel):
 super().set_gguf_parameters()
 hparams = self.hparams
-# first_k_dense_replace: number of leading layers using dense FFN instead of MoE
-# For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
-# For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
-has_moe = hparams.get("n_routed_experts") is not None
-first_k_dense_replace = hparams.get("first_k_dense_replace")
-if first_k_dense_replace is None:
-# Default: if no MoE, all layers are dense; if MoE, none are dense
-first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
-self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
 self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7290,24 +7259,11 @@ class DeepseekV2Model(TextModel):
 self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
 self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
-# MoE parameters (required by C++ code for DEEPSEEK2 arch)
-# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
-moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
-self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
-self.gguf_writer.add_expert_count(n_routed_experts)
-# expert_shared_count is required by C++ code, default to 0 for non-MoE models
-n_shared_experts = hparams.get("n_shared_experts", 0)
-self.gguf_writer.add_expert_shared_count(n_shared_experts)
-# When not set, C++ code will use scale_w = false to skip the no-op scaling
-if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
-self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
-if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
-self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
 self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
@@ -7323,17 +7279,10 @@ class DeepseekV2Model(TextModel):
 # skip vision tensors and remove "language_model." for Kimi-VL
 if "vision_tower" in name or "multi_modal_projector" in name:
 return []
-if name.startswith("siglip2.") or name.startswith("merger."):
-return []
 if name.startswith("language_model."):
 name = name.replace("language_model.", "")
-# skip lm_head.weight if tie_word_embeddings is True
-if self.hparams.get("tie_word_embeddings", False):
-if name == "lm_head.weight" or name == "model.lm_head.weight":
-logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
-return []
 # rename e_score_correction_bias tensors
 if name.endswith("e_score_correction_bias"):
 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -9343,19 +9292,6 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel):
 self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
-@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
-class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
-def set_gguf_parameters(self):
-super().set_gguf_parameters()
-self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
-def tensor_force_quant(self, name, new_name, bid, n_dims):
-if ".conv" in name and ".weight" in name:
-# Was trained in BF16, being safe, avoiding quantizing to FP16
-return gguf.GGMLQuantizationType.F32
-return super().tensor_force_quant(name, new_name, bid, n_dims)
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
 model_arch = gguf.MODEL_ARCH.FALCON_H1
@@ -10668,79 +10604,6 @@ class JanusProVisionModel(MmprojModel):
 return []
-@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
-class YOUTUVLVisionModel(MmprojModel):
-def __init__(self, *args, **kwargs):
-super().__init__(*args, **kwargs)
-assert self.hparams_vision is not None
-self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
-def set_gguf_parameters(self):
-super().set_gguf_parameters()
-self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
-self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-# Handle activation function
-hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
-if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
-self.gguf_writer.add_vision_use_gelu(True)
-elif hidden_act == "silu":
-self.gguf_writer.add_vision_use_silu(True)
-else:
-raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
-self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
-window_size = self.hparams.get("window_size")
-if window_size is not None:
-self.gguf_writer.add_vision_window_size(window_size)
-# fullatt_block_indexes contains explicit layer indices that use full attention
-# e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
-# All other layers use window attention
-fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
-assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
-# Store the explicit layer indices for YoutuVL (irregular pattern approach)
-self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
-def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-del bid # unused
-# Skip language model tensors
-skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
-if name.startswith(skip_prefixes):
-return []
-# Try to map the tensor using TensorNameMap (handles vision encoder and projector)
-try:
-new_name = self.map_tensor_name(name)
-return [(new_name, data_torch)]
-except ValueError:
-# If mapping fails, log warning and skip
-logger.warning(f"Cannot map tensor: {name}")
-return []
-@ModelBase.register("SolarOpenForCausalLM")
-class SolarOpenModel(Glm4MoeModel):
-model_arch = gguf.MODEL_ARCH.GLM4_MOE
-def set_vocab(self):
-from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-tokens, toktypes, tokpre = self.get_vocab_base()
-self.gguf_writer.add_tokenizer_model("gpt2")
-self.gguf_writer.add_tokenizer_pre(tokpre)
-self.gguf_writer.add_token_list(tokens)
-self.gguf_writer.add_token_types(toktypes)
-special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
-special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
-special_vocab.add_to_gguf(self.gguf_writer)
 ###### CONVERSION LOGIC ######

View File

@@ -145,8 +145,6 @@ models = [
 {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
 {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
 {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
-{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
-{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
 ]
 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -167,8 +165,6 @@ pre_computed_hashes = [
 {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
 {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
 {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
-# jina-v2-de variants
-{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
 ]

View File

@@ -68,7 +68,7 @@ int main(int argc, char ** argv) {
 auto sparams = llama_sampler_chain_default_params();
 sparams.no_perf = false;
-std::vector<llama_sampler_seq_config> sampler_configs;
+std::vector<llama_sampler *> samplers;
 for (int32_t i = 0; i < n_parallel; ++i) {
 llama_sampler * smpl = llama_sampler_chain_init(sparams);
@@ -78,13 +78,7 @@ int main(int argc, char ** argv) {
 llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
 llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
-sampler_configs.push_back({ i, smpl });
-}
-// TODO: temporarily gated behind a flag
-if (params.sampling.backend_sampling) {
-ctx_params.samplers = sampler_configs.data();
-ctx_params.n_samplers = sampler_configs.size();
+samplers.push_back(smpl);
 }
 llama_context * ctx = llama_init_from_model(model, ctx_params);
@@ -186,7 +180,7 @@ int main(int argc, char ** argv) {
 continue;
 }
-const llama_token new_token_id = llama_sampler_sample(sampler_configs[i].sampler, ctx, i_batch[i]);
+const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]);
 // is it an end of generation? -> mark the stream as finished
 if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
@@ -242,15 +236,15 @@ int main(int argc, char ** argv) {
 __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 LOG("\n");
-llama_perf_sampler_print(sampler_configs[0].sampler);
+llama_perf_sampler_print(samplers[0]);
 llama_perf_context_print(ctx);
 fprintf(stderr, "\n");
 llama_batch_free(batch);
-for (auto & sampler_config : sampler_configs) {
-llama_sampler_free(sampler_config.sampler);
+for (auto & sampler_config : samplers) {
+llama_sampler_free(sampler_config);
 }
 llama_free(ctx);

View File

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 5)
+set(GGML_VERSION_PATCH 4)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@@ -358,7 +358,7 @@ extern "C" {
 typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 // Compare the output of two backends
-GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);
+GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 // Tensor initialization
 GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

View File

@@ -2053,7 +2053,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
 ggml_free(copy.ctx_unallocated);
 }
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
 struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
 if (copy.buffer == NULL) {
 return false;
@@ -2064,22 +2064,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 assert(g1->n_nodes == g2->n_nodes);
-if (num_test_nodes != 0) {
-GGML_ASSERT(test_nodes);
-// Compute the whole graph and only test the output for specific tensors
+if (test_node != nullptr) {
+// Compute the whole graph and only test the output for a specific tensor
 ggml_backend_graph_compute(backend1, g1);
 ggml_backend_graph_compute(backend2, g2);
-bool verified = false;
+int test_node_idx = -1;
 for (int i = 0; i < g1->n_nodes; i++) {
-for (size_t j = 0; j < num_test_nodes; ++j) {
-if (g1->nodes[i] == test_nodes[j]) {
-callback(i, g1->nodes[i], g2->nodes[i], user_data);
-verified = true;
-}
+struct ggml_tensor * t1 = g1->nodes[i];
+if (t1 == test_node) {
+test_node_idx = i;
+break;
 }
 }
-GGML_ASSERT(verified);
+GGML_ASSERT(test_node_idx != -1);
+callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
 } else {
 for (int i = 0; i < g1->n_nodes; i++) {
 struct ggml_tensor * t1 = g1->nodes[i];
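For reference, a hedged usage sketch of the single-node signature on the b7592 side of this hunk. The callback, the tolerance check, and the choice of test tensor are illustrative assumptions, not part of ggml:

```cpp
#include "ggml.h"
#include "ggml-backend.h"
#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative comparison callback: downloads both tensors and checks them
// element-wise (assumes a small float32 tensor).
static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) user_data;
    std::vector<float> a(ggml_nelements(t1));
    std::vector<float> b(ggml_nelements(t2));
    ggml_backend_tensor_get(t1, a.data(), 0, ggml_nbytes(t1));
    ggml_backend_tensor_get(t2, b.data(), 0, ggml_nbytes(t2));
    for (size_t i = 0; i < a.size(); ++i) {
        if (std::fabs(a[i] - b[i]) > 1e-4f) {
            fprintf(stderr, "node %d (%s): mismatch at element %zu\n", node_index, t1->name, i);
            return false;
        }
    }
    return true;
}

// Elsewhere, with backend1/backend2/graph already built:
//   struct ggml_tensor * out = ggml_graph_node(graph, -1); // last node
//   ggml_backend_compare_graph_backend(backend1, backend2, graph, eval_cb, nullptr, out);
```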

View File

@@ -54,20 +54,6 @@ if (CUDAToolkit_FOUND)
 enable_language(CUDA)
-# TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
-if (GGML_CUDA_CUB_3DOT2)
-include(FetchContent)
-FetchContent_Declare(
-CCCL
-GIT_REPOSITORY https://github.com/nvidia/cccl.git
-GIT_TAG v3.2.0-rc2
-GIT_SHALLOW TRUE
-)
-FetchContent_MakeAvailable(CCCL)
-endif()
 # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
 # 12X is forwards-compatible, 12Xa is not.
 # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
@@ -157,9 +143,6 @@ if (CUDAToolkit_FOUND)
 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
 else ()
-if (GGML_CUDA_CUB_3DOT2)
-target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
-endif()
 if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
 else()
@@ -167,9 +150,6 @@ if (CUDAToolkit_FOUND)
 endif()
 endif()
 else()
-if (GGML_CUDA_CUB_3DOT2)
-target_link_libraries(ggml-cuda PRIVATE CCCL::CCCL)
-endif()
 target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
 endif()
@@ -238,10 +218,6 @@ if (CUDAToolkit_FOUND)
 if (NOT MSVC)
 list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-else()
-# CCCL 3.2 onwards will require a cpp-standard-compliant preprocessor for MSVC
-# https://github.com/NVIDIA/cccl/pull/6827
-list(APPEND CUDA_CXX_FLAGS /Zc:preprocessor)
 endif()
 list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument

View File

@@ -22,13 +22,13 @@ static __global__ void init_offsets(int * offsets, const int ncols, const int nr
 }
 #ifdef GGML_CUDA_USE_CUB
-void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
+static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
 const float * x,
 int * dst,
 const int ncols,
 const int nrows,
 ggml_sort_order order,
 cudaStream_t stream) {
 ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
 ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
 ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
@@ -49,49 +49,28 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
 size_t temp_storage_bytes = 0;
 if (order == GGML_SORT_ORDER_ASC) {
-if (nrows == 1) {
-DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
-temp_indices, dst, // values (indices)
-ncols, 0, sizeof(float) * 8, stream);
-} else {
-DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
-temp_indices, dst, // values (indices)
-ncols * nrows, nrows, // num items, num segments
-d_offsets, d_offsets + 1, stream);
-}
+DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
+temp_indices, dst, // values (indices)
+ncols * nrows, nrows, // num items, num segments
+d_offsets, d_offsets + 1, 0, sizeof(float) * 8, // all bits
+stream);
 } else {
-if (nrows == 1) {
-DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
-temp_indices, dst, // values (indices)
-ncols, 0, sizeof(float) * 8, stream);
-} else {
-DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
-dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
-}
+DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
+dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
+sizeof(float) * 8, stream);
 }
 ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
 void * d_temp_storage = temp_storage_alloc.get();
 if (order == GGML_SORT_ORDER_ASC) {
-if (nrows == 1) {
-DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
-temp_indices, dst, // values (indices)
-ncols, 0, sizeof(float) * 8, stream);
-} else {
-DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
-ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
-}
+DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
+ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
+stream);
 } else {
-if (nrows == 1) {
-DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
-temp_indices, dst, // values (indices)
-ncols, 0, sizeof(float) * 8, stream);
-} else {
-DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
-temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
-stream);
-}
+DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
+temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
+0, sizeof(float) * 8, stream);
 }
 }
 #endif // GGML_CUDA_USE_CUB
@ -162,12 +141,12 @@ static int next_power_of_2(int x) {
return n; return n;
} }
void argsort_f32_i32_cuda_bitonic(const float * x, static void argsort_f32_i32_cuda_bitonic(const float * x,
int * dst, int * dst,
const int ncols, const int ncols,
const int nrows, const int nrows,
ggml_sort_order order, ggml_sort_order order,
cudaStream_t stream) { cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2 // bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols); const int ncols_pad = next_power_of_2(ncols);
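Both branches above follow CUB's two-phase temp-storage protocol: every `Device*` algorithm is first called with a null scratch pointer purely to learn how many bytes it needs, then called again with identical arguments and real scratch memory. A minimal sketch of that protocol outside the ggml pool machinery (the helper name and the `cudaMallocAsync` scratch handling are illustrative, not from the tree):

```cpp
#include <cub/cub.cuh>

// Sketch of the size-query/run protocol used by argsort_f32_i32_cuda_cub above.
// keys are sorted in place; vals_in/vals_out carry the permutation (indices).
static void segmented_sort_pairs(float * keys, const int * vals_in, int * vals_out,
                                 int n_items, int n_segments, const int * offsets,
                                 cudaStream_t stream) {
    size_t tmp_bytes = 0;
    // Call 1: d_temp_storage == nullptr, so CUB only writes the required size.
    cub::DeviceSegmentedRadixSort::SortPairs(nullptr, tmp_bytes, keys, keys, vals_in, vals_out,
                                             n_items, n_segments, offsets, offsets + 1,
                                             0, sizeof(float) * 8, stream);
    void * d_tmp = nullptr;
    cudaMallocAsync(&d_tmp, tmp_bytes, stream); // the real code draws from the ggml CUDA pool
    // Call 2: identical arguments plus real scratch memory; the sort runs now.
    cub::DeviceSegmentedRadixSort::SortPairs(d_tmp, tmp_bytes, keys, keys, vals_in, vals_out,
                                             n_items, n_segments, offsets, offsets + 1,
                                             0, sizeof(float) * 8, stream);
    cudaFreeAsync(d_tmp, stream);
}
```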


@@ -1,19 +1,3 @@
 #include "common.cuh"
 
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-#ifdef GGML_CUDA_USE_CUB
-void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
-                              const float * x,
-                              int * dst,
-                              const int ncols,
-                              const int nrows,
-                              ggml_sort_order order,
-                              cudaStream_t stream);
-#endif // GGML_CUDA_USE_CUB
-
-void argsort_f32_i32_cuda_bitonic(const float * x,
-                                  int * dst,
-                                  const int ncols,
-                                  const int nrows,
-                                  ggml_sort_order order,
-                                  cudaStream_t stream);
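The deleted declarations correspond to the `static` qualifiers added in argsort.cu: once the helpers have internal linkage they are invisible outside their translation unit, so a header declaration would be at best useless and at worst a link error for any external caller. A toy illustration (hypothetical names):

```cpp
// argsort.cu (sketch)
static void helper(void) { /* visible only inside this .cu file */ }

// some-other-file.cu (sketch)
// extern void helper(void);  // would compile, but a call could never link,
// helper();                  // because no external symbol is emitted for `helper`
```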


@@ -950,16 +950,15 @@ struct ggml_cuda_device_info {
     int device_count;
 
     struct cuda_device_info {
         int     cc;              // compute capability
         int     nsm;             // number of streaming multiprocessors
         size_t  smpb;            // max. shared memory per block
         size_t  smpbo;           // max. shared memory per block (with opt-in)
         bool    integrated;      // Device is integrated as opposed to discrete
         bool    vmm;             // virtual memory support
         size_t  vmm_granularity; // granularity of virtual memory
         size_t  total_vram;
         int     warp_size;       // Number of threads in a dispatch
-        bool    supports_cooperative_launch; // whether cooperative launch is supported
     };
 
     cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
@@ -1059,11 +1058,11 @@ struct ggml_cuda_graph {
     cudaGraphExec_t instance = nullptr;
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
+    std::vector<cudaKernelNodeParams> params;
     bool disable_due_to_gpu_arch = false;
     bool disable_due_to_too_many_updates = false;
     bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
-    bool cuda_graphs_enabled = false;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
 #endif
 };


@@ -12,11 +12,11 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
 const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
 
 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-    const int64_t nb12, const int64_t nb13) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+    const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
         return;
@@ -40,10 +40,10 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
 }
 
 template <typename T>
-static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-    const int64_t nb12, const int64_t nb13) {
+static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+    const int nb12, const int nb13) {
 
     const T* src = reinterpret_cast<const T*>(cx);
     T* dst = reinterpret_cast<T*>(cdst);
@@ -117,60 +117,60 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
 }
 
 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-    const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+    const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
 
     if (i >= ne) {
         return;
     }
 
-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
 
-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
 
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-    const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+    const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
 
     if (i >= ne) {
         return;
     }
 
-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
 
-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
 
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
 template<typename src_t, typename dst_t>
 static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
         return;
@@ -188,20 +188,19 @@ static void ggml_cpy_scalar_contiguous_cuda(
     cudaStream_t stream) {
 
     const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    GGML_ASSERT(num_blocks < UINT_MAX);
     cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne);
 }
 
 template<typename src_t, typename dst_t, bool transposed = false>
 static void ggml_cpy_scalar_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     if (transposed) {
         GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
-        int64_t ne00n, ne01n, ne02n;
+        int ne00n, ne01n, ne02n;
         if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
             ne00n = ne00;
             ne01n = ne01;
@@ -212,159 +211,143 @@ static void ggml_cpy_scalar_cuda(
             ne02n = 1;
         }
 
-        int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
-        GGML_ASSERT(grid_x < UINT_MAX);
-        GGML_ASSERT(grid_y < USHRT_MAX);
-        GGML_ASSERT(grid_z < USHRT_MAX);
-        dim3 dimGrid(grid_x, grid_y, grid_z);
+        dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
         dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
         cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
             (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     } else {
-        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        GGML_ASSERT(num_blocks < UINT_MAX);
+        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
         cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
             (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
     }
 }
 
 static void ggml_cpy_f32_q8_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK8_0 == 0);
-    const int64_t num_blocks = ne / QK8_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK8_0;
     cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_q8_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q4_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK4_0 == 0);
-    const int64_t num_blocks = ne / QK4_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_0;
     cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_q4_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
-    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
+    cudaStream_t stream) {
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q4_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK4_1 == 0);
-    const int64_t num_blocks = ne / QK4_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_1;
     cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_q4_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
-    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
+    cudaStream_t stream) {
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q5_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK5_0 == 0);
-    const int64_t num_blocks = ne / QK5_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_0;
     cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_q5_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
-    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
+    cudaStream_t stream) {
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q5_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK5_1 == 0);
-    const int64_t num_blocks = ne / QK5_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_1;
     cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_q5_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
-    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
+    cudaStream_t stream) {
+    const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_iq4_nl_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK4_NL == 0);
-    const int64_t num_blocks = ne / QK4_NL;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_NL;
     cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -373,6 +356,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
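The recurring int64_t-to-int narrowing in this file is only safe because `ggml_cuda_cpy` now asserts both tensors stay under INT_MAX bytes, as the last hunk shows. The hazard the 64-bit variant guards against is that the per-thread global index must be widened *before* the multiply, or it wraps for large grids. A sketch (illustrative kernel, not from the tree):

```cpp
static __global__ void copy_bytes(const char * src, char * dst, const int64_t n) {
    // Wrong for n >= 2^31: blockDim.x * blockIdx.x is evaluated in 32-bit
    // arithmetic and overflows long before the product reaches n.
    // const int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Correct: promote the first operand so the whole expression is 64-bit.
    const int64_t i = (int64_t) blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
}
```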


@@ -5,7 +5,7 @@
 #include "ggml.h"
 
 #ifdef GGML_CUDA_USE_CUB
-#   include <cub/cub.cuh>
+#   include <cub/block/block_scan.cuh>
 #endif // GGML_CUDA_USE_CUB
 
 template<typename T, int BLOCK_SIZE>
@@ -185,34 +185,9 @@ static __global__ void cumsum_kernel(
     }
 }
 
-#ifdef GGML_CUDA_USE_CUB
-template <typename T>
-static void cumsum_cub(ggml_cuda_pool & pool,
-                       const T * src,
-                       T * dst,
-                       int64_t ne,
-                       cudaStream_t stream) {
-    size_t tmp_size = 0;
-
-    // Query how much temp storage CUDA UnBound (CUB) needs
-    cub::DeviceScan::InclusiveSum(nullptr,  // d_temp_storage (null = just query size)
-                                  tmp_size, // reference to size (will be set by CUB)
-                                  src,      // input pointer
-                                  dst,      // output pointer
-                                  ne,       // number of elements
-                                  stream    // CUDA stream to use
-    );
-
-    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
-
-    // Perform the inclusive scan
-    cub::DeviceScan::InclusiveSum((void *) tmp_alloc.get(), tmp_size, src, dst, ne, stream);
-}
-#endif // GGML_CUDA_USE_CUB
-
 template<typename T>
 static void cumsum_cuda(
-    [[maybe_unused]] ggml_backend_cuda_context & ctx, const T * src, T * dst,
+    const T * src, T * dst,
     const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
     const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
     const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
@@ -226,15 +201,6 @@ static void cumsum_cuda(
     if (is_contiguous) {
         use_cub = true;
-
-        const int64_t nrows = ne01 * ne02 * ne03;
-        // TODO: Compare with DeviceSegmentedScan::InclusiveSegmentedSum for nrows > 1 once InclusiveSegmentedSum is released
-        // Heuristics were determined as part of https://github.com/ggml-org/llama.cpp/pull/17004
-        if (((nrows == 1) && (ne00 > 1024)) || (ne00 / nrows > 4096)) {
-            for (int i = 0; i < nrows; i++) {
-                cumsum_cub(ctx.pool(), src + i * ne00, dst + i * ne00, ne00, stream);
-            }
-            return;
-        }
     }
 #endif // GGML_CUDA_USE_CUB
 
     dim3 grid_dims(ne01, ne02, ne03);
@@ -273,7 +239,7 @@ void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         case GGML_TYPE_F32:
             {
                 cumsum_cuda(
-                    ctx, (const float *)src0->data, (float *)dst->data,
+                    (const float *)src0->data, (float *)dst->data,
                     src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                     src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                     dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
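The removed `cumsum_cub` helper is a direct application of `cub::DeviceScan::InclusiveSum`, issued once per contiguous row whenever the long-row heuristic fires. Stripped of the pool allocator, the per-row dispatch looks roughly like this (the function name and the `cudaMallocAsync` scratch handling are illustrative, not from the tree):

```cpp
#include <cub/cub.cuh>

static void cumsum_rows(const float * src, float * dst, int64_t ne00, int64_t nrows,
                        cudaStream_t stream) {
    for (int64_t r = 0; r < nrows; ++r) {
        const float * in  = src + r * ne00;
        float       * out = dst + r * ne00;
        size_t tmp_size = 0;
        cub::DeviceScan::InclusiveSum(nullptr, tmp_size, in, out, ne00, stream); // size query
        void * tmp = nullptr;
        cudaMallocAsync(&tmp, tmp_size, stream);
        cub::DeviceScan::InclusiveSum(tmp, tmp_size, in, out, ne00, stream);     // actual scan
        cudaFreeAsync(tmp, stream);
    }
}
```

The thresholds in the deleted heuristic (ne00 > 1024 for a single row, ne00/nrows > 4096 otherwise) mark where a device-wide scan beats the block-level kernel that otherwise handles each row.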


@@ -918,9 +918,7 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
 
-        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
-            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
-        }
+        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.


@@ -19,7 +19,6 @@
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
-#include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -45,7 +44,6 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
-#include "ggml-cuda/top-k.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/topk-moe.cuh"
@@ -203,6 +201,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
 
     int64_t total_vram = 0;
 
+#ifdef GGML_CUDA_FORCE_MMQ
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    no\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
 
     std::vector<std::pair<int, std::string>> turing_devices_without_mma;
@@ -233,14 +241,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm       = prop.multiProcessorCount;
         info.devices[id].smpb      = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
-#ifndef GGML_USE_MUSA
-        int supports_coop_launch = 0;
-        CUDA_CHECK(cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, id));
-        info.devices[id].supports_cooperative_launch = !!supports_coop_launch;
-#else
-        info.devices[id].supports_cooperative_launch = false;
-#endif // !(GGML_USE_MUSA)
-
 #if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -2687,9 +2687,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_SUM:
             ggml_cuda_op_sum(ctx, dst);
             break;
-        case GGML_OP_CUMSUM:
-            ggml_cuda_op_cumsum(ctx, dst);
-            break;
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
@@ -2702,9 +2699,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_SSM_SCAN:
             ggml_cuda_op_ssm_scan(ctx, dst);
             break;
-        case GGML_OP_TOP_K:
-            ggml_cuda_op_top_k(ctx, dst);
-            break;
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
@@ -2714,6 +2708,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_CROSS_ENTROPY_LOSS:
            ggml_cuda_cross_entropy_loss(ctx, dst);
            break;
+        case GGML_OP_CUMSUM:
+            ggml_cuda_op_cumsum(ctx, dst);
+            break;
         case GGML_OP_TRI:
             ggml_cuda_op_tri(ctx, dst);
             break;
@@ -3266,7 +3263,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
         }
     }
-
     if (should_launch_concurrent_events) {
         // Restore original node order within each concurrent region to enable fusion within streams
@@ -3318,8 +3314,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
             }
         }
-    } else {
-        stream_ctx.concurrent_events.clear();
     }
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3708,7 +3702,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
     }
 }
 
-static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
@@ -3719,6 +3716,7 @@
     }
 
     bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
@@ -3739,27 +3737,6 @@
             use_cuda_graph = false;
         }
     }
-
-    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
-#else
-    bool use_cuda_graph = false;
-#endif // USE_CUDA_GRAPH
-    return use_cuda_graph;
-}
-
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
-
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-
-    // graph_optimize calls set_cuda_graph_enabled; in case it is not called (i.e. graph_compute is called directly),
-    // we call it here instead.
-#ifdef USE_CUDA_GRAPH
-    use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
-
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
@@ -3774,13 +3751,11 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
                 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
-                cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
             }
         }
-#endif // USE_CUDA_GRAPH
 
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
@@ -3792,6 +3767,11 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
     bool graph_evaluated_or_captured = false;
 
     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
@@ -3827,10 +3807,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
 
-    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
-
     static bool enable_graph_optimization = [] {
         const char * env = getenv("GGML_CUDA_GRAPH_OPT");
         return env != nullptr && atoi(env) == 1;
     }();
@@ -3838,13 +3816,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
         return;
     }
 
+    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
+    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();
 
-    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
-        return;
-    }
-
     // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
     // reverse mapping of node to index in the cgraph
@@ -3905,12 +3882,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];
 
-            // only optimize for attn_norm
-            // TODO: make this more generic
-            if (!strstr(root_node->name, "attn_norm")) {
-                continue;
-            }
-
             bool is_part_of_event = false;
             for (const auto & [start, end] : concurrent_node_ranges) {
                 if (root_node_idx >= start && root_node_idx <= end) {
@@ -4639,7 +4610,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             return true;
         case GGML_OP_SUM:
             return ggml_is_contiguous_rows(op->src[0]);
-        case GGML_OP_TOP_K:
         case GGML_OP_ARGSORT:
 #ifndef GGML_CUDA_USE_CUB
             return op->src[0]->ne[0] <= 1024;
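The hunks above drop the cooperative-launch capability probe together with its only consumer. For reference, the removed pattern pairs a device-attribute query with `cudaLaunchCooperativeKernel`, the launch mode that permits grid-wide synchronization inside a kernel. A minimal sketch with a placeholder kernel (not from the tree):

```cpp
static __global__ void coop_kernel(float * data, int n);  // placeholder, defined elsewhere

static void launch_cooperatively(int device, float * data, int n, cudaStream_t stream) {
    int coop = 0;
    cudaDeviceGetAttribute(&coop, cudaDevAttrCooperativeLaunch, device);
    if (!coop) {
        return; // real code would fall back to an ordinary <<<...>>> launch
    }
    void * args[] = { (void *) &data, (void *) &n };
    // Cooperative launches guarantee all blocks are resident simultaneously,
    // which is what makes grid.sync() inside the kernel legal.
    cudaLaunchCooperativeKernel((void *) coop_kernel, dim3(8), dim3(256), args, 0, stream);
}
```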


@@ -1,14 +1,6 @@
 #include "common.cuh"
 #include "ggml.h"
 #include "softmax.cuh"
 
-#ifdef GGML_USE_HIP
-#    include <hip/hip_cooperative_groups.h>
-#else
-#    include <cooperative_groups.h>
-#    include <cooperative_groups/reduce.h>
-#endif // GGML_USE_HIP
-
 #include <cstdint>
 #include <utility>
@@ -168,156 +160,6 @@ static __global__ void soft_max_f32(
         dst[col] = vals[col] * inv_sum;
     }
 }
 
-// TODO: This is a common pattern used across kernels that could be moved to common.cuh + templated
-static __device__ float two_stage_warp_reduce_max(float val) {
-    val = warp_reduce_max(val);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float local_vals[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            local_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = -INFINITY;
-        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
-            val = local_vals[lane_id];
-        }
-        return warp_reduce_max(val);
-    } else {
-        return val;
-    }
-}
-
-static __device__ float two_stage_warp_reduce_sum(float val) {
-    val = warp_reduce_sum(val);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float local_vals[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            local_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = 0.0f;
-        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
-            val = local_vals[lane_id];
-        }
-        return warp_reduce_sum(val);
-    } else {
-        return val;
-    }
-}
-
-// TODO: Template to allow keeping ncols in registers if they fit
-static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
-                                                                float * __restrict__ dst,
-                                                                float * __restrict__ tmp_maxs,
-                                                                float * __restrict__ tmp_sums,
-                                                                const soft_max_params p) {
-    namespace cg = cooperative_groups;
-    const cg::grid_group g = cg::this_grid();
-
-    const int tid       = threadIdx.x;
-    const int col_start = blockIdx.x * blockDim.x + tid;
-
-    const int n_elem_per_thread = 4;
-    float local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
-    float local_max = -INFINITY;
-
-    const int step_size = gridDim.x * blockDim.x;
-
-    // Compute thread-local max
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            local_max = fmaxf(local_max, local_vals[i]);
-        }
-        col += step_size * n_elem_per_thread;
-    }
-
-    // Compute CTA-level max
-    local_max = two_stage_warp_reduce_max(local_max);
-
-    // Store CTA-level max to GMEM
-    if (tid == 0) {
-        tmp_maxs[blockIdx.x] = local_max;
-    }
-    g.sync();
-
-    // Compute global max from CTA-level maxs
-    assert(gridDim.x < blockDim.x); // currently we only support this case
-    if (tid < gridDim.x) {
-        local_max = tmp_maxs[tid];
-    } else {
-        local_max = -INFINITY;
-    }
-    local_max = two_stage_warp_reduce_max(local_max);
-
-    // Compute softmax dividends, accumulate divisor
-    float tmp_expf = 0.0f;
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            if (idx < p.ncols) {
-                const float tmp = expf(local_vals[i] - local_max);
-                tmp_expf += tmp;
-                dst[idx] = tmp;
-            }
-        }
-        col += step_size * n_elem_per_thread;
-    }
-
-    // Reduce divisor within CTA
-    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
-
-    // Store CTA-level sum to GMEM
-    if (tid == 0) {
-        tmp_sums[blockIdx.x] = tmp_expf;
-    }
-    g.sync();
-
-    // Compute global sum from CTA-level sums
-    if (tid < gridDim.x) {
-        tmp_expf = tmp_sums[tid];
-    } else {
-        tmp_expf = 0.0f;
-    }
-    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
-
-    // Divide dividend by global sum + store data
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? dst[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            if (idx < p.ncols) {
-                dst[idx] = local_vals[i] / tmp_expf;
-            }
-        }
-        col += step_size * n_elem_per_thread;
-    }
-}
-
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif // __clang__
@@ -374,31 +216,9 @@ static void launch_soft_max_kernels(const float * x, const T * mask, const float * sinks,
         soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, sinks, dst, p);
 }
 
-__launch_bounds__(8*WARP_SIZE, 1) static __global__ void soft_max_f32_parallelize_cols(const float * __restrict__ x,
-                                                                                       float * __restrict__ dst,
-                                                                                       float * __restrict__ tmp_maxs,
-                                                                                       float * __restrict__ tmp_sums,
-                                                                                       const soft_max_params p)
-// We loop over all rows instead of parallelizing across gridDim.y, as cooperative groups
-// currently only support synchronizing the complete grid if not launched as a cluster group
-// (which requires CC > 9.0)
-// https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/device-callable-apis.html#grid-synchronization
-// https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/device-callable-apis.html#class-cluster-group
-{
-    for (int rowx = 0; rowx < p.ne01 * p.ne02 * p.ne03; rowx++) {
-        soft_max_f32_parallelize_cols_single_row(x + int64_t(rowx) * p.ncols, dst + int64_t(rowx) * p.ncols, tmp_maxs,
-                                                 tmp_sums, p);
-    }
-}
-
-template <typename T>
-static void soft_max_f32_cuda(const float * x,
-                              const T * mask,
-                              const float * sinks,
-                              float * dst,
-                              const soft_max_params & params,
-                              cudaStream_t stream,
-                              [[maybe_unused]] ggml_backend_cuda_context & ctx) {
+template<typename T>
+static void soft_max_f32_cuda(const float * x, const T * mask, const float * sinks, float * dst, const soft_max_params & params, cudaStream_t stream) {
     int nth = WARP_SIZE;
     const int64_t ncols_x = params.ncols;
@@ -416,25 +236,8 @@ static void soft_max_f32_cuda(const float * x,
 
     if (nbytes_shared <= smpbo) {
         launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, sinks, dst, params, stream, block_dims, block_nums, nbytes_shared);
     } else {
-        // Parallelize across SMs for top-p/dist-sampling.
-        // The heuristic for parallelizing rows across SMs vs parallelizing a single row & looping over all rows was determined on a B6000 GPU
-        // and can be adapted further for lower-SM-count GPUs, though keeping data in registers should be implemented first as that is the optimal solution.
-        if (ggml_cuda_info().devices[id].supports_cooperative_launch &&
-            ncols_x / (params.ne01 * params.ne02 * params.ne03) > 8192 && mask == nullptr && sinks == nullptr &&
-            params.scale == 1.0f && params.max_bias == 0.0f) {
-            ggml_cuda_pool_alloc<float> tmp_maxs_alloc(ctx.pool(), ggml_cuda_info().devices[id].nsm * sizeof(float));
-            ggml_cuda_pool_alloc<float> tmp_sums_alloc(ctx.pool(), ggml_cuda_info().devices[id].nsm * sizeof(float));
-            void * kernel_args[] = { (void *) &x, (void *) &dst, (void *) &tmp_maxs_alloc.ptr,
-                                     (void *) &tmp_sums_alloc.ptr, (void *) const_cast<soft_max_params *>(&params) };
-            CUDA_CHECK(cudaLaunchCooperativeKernel((void *) soft_max_f32_parallelize_cols,
-                                                   dim3(ggml_cuda_info().devices[id].nsm, 1, 1),
-                                                   dim3(WARP_SIZE * 8, 1, 1), kernel_args, 0, stream));
-        } else {
-            const size_t nbytes_shared_low = WARP_SIZE * sizeof(float);
-            soft_max_f32<false, 0, 0>
-                <<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, sinks, dst, params);
-        }
+        const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, sinks, dst, params);
     }
 }
@@ -512,9 +315,9 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     params.m1 = m1;
 
     if (use_f16) {
-        soft_max_f32_cuda(src0_d, (const half *) src1_d, (const float *) src2_d, dst_d, params, stream, ctx);
+        soft_max_f32_cuda(src0_d, (const half *) src1_d, (const float *) src2_d, dst_d, params, stream);
     } else {
-        soft_max_f32_cuda(src0_d, (const float *) src1_d, (const float *) src2_d, dst_d, params, stream, ctx);
+        soft_max_f32_cuda(src0_d, (const float *) src1_d, (const float *) src2_d, dst_d, params, stream);
    }
 }
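The deleted single-row path splits soft max into two grid-wide reductions (global max, then global sum of exponentials) with a cooperative-groups barrier between them. The essential structure, reduced here to a grid-wide max with one warp per block, is sketched below; this is a simplified illustration under the assumption of a cooperative launch, not the tree's kernel:

```cpp
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// partials must hold gridDim.x floats; launch with blockDim.x == 32.
static __global__ void grid_max(const float * x, int n, float * partials, float * result) {
    const cg::grid_group grid = cg::this_grid();

    float m = -INFINITY;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        m = fmaxf(m, x[i]);
    }
    for (int off = 16; off > 0; off >>= 1) { // butterfly reduction within the warp
        m = fmaxf(m, __shfl_xor_sync(0xffffffff, m, off));
    }
    if (threadIdx.x == 0) {
        partials[blockIdx.x] = m;
    }

    grid.sync(); // every block's partial is visible past this point

    if (blockIdx.x == 0 && threadIdx.x == 0) {
        float g = -INFINITY;
        for (int b = 0; b < (int) gridDim.x; ++b) {
            g = fmaxf(g, partials[b]);
        }
        *result = g;
    }
}
```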
} }


@ -1,96 +0,0 @@
#include "argsort.cuh"
#include "top-k.cuh"
#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
# include <cuda/iterator>
# define CUB_TOP_K_AVAILABLE
using namespace cub;
# endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
#endif // GGML_CUDA_USE_CUB
#ifdef CUB_TOP_K_AVAILABLE
static void top_k_cub(ggml_cuda_pool & pool,
const float * src,
int * dst,
const int ncols,
const int k,
cudaStream_t stream) {
auto requirements = cuda::execution::require(cuda::execution::determinism::not_guaranteed,
cuda::execution::output_ordering::unsorted);
auto stream_env = cuda::stream_ref{ stream };
auto env = cuda::std::execution::env{ stream_env, requirements };
auto indexes_in = cuda::make_counting_iterator(0);
size_t temp_storage_bytes = 0;
DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
env);
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();
DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
ncols, k, env);
}
#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE
static int next_power_of_2(int x) {
int n = 1;
while (n < x) {
n *= 2;
}
return n;
}
#endif // CUB_TOP_K_AVAILABLE
void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *) src0->data;
int * dst_d = (int *) dst->data;
cudaStream_t stream = ctx.stream();
// are these asserts truly necessary?
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const int64_t k = dst->ne[0];
ggml_cuda_pool & pool = ctx.pool();
#ifdef CUB_TOP_K_AVAILABLE
// TODO: Switch to `DeviceSegmentedTopK` for multi-row TopK once implemented
// https://github.com/NVIDIA/cccl/issues/6391
// TODO: investigate if there exists a point where parallelized argsort is faster than sequential top-k
for (int i = 0; i < nrows; i++) {
top_k_cub(pool, src0_d + i * ncols, dst_d + i * k, ncols, k, stream);
}
#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE
// Fall back to argsort + copy
const int ncols_pad = next_power_of_2(ncols);
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
int * tmp_dst = temp_dst_alloc.get();
if (shared_mem > max_shared_mem || ncols > 1024) {
argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
} else {
argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
}
CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
cudaMemcpyDeviceToDevice, stream));
#else // GGML_CUDA_USE_CUB
ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
int * tmp_dst = temp_dst_alloc.get();
argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
cudaMemcpyDeviceToDevice, stream));
#endif
}
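Editor's note: the fallback above is semantically just "argsort each row in descending order, keep the first k indices"; the strided cudaMemcpy2DAsync copies the leading k ints out of every ncols-wide row of the temporary argsort output. A host-side C++ sketch of the same semantics, using a hypothetical top_k_ref helper that is not part of the tree:

#include <algorithm>
#include <numeric>
#include <vector>

// Reference semantics of the argsort fallback: descending argsort of one row,
// truncated to its first k indices.
static std::vector<int> top_k_ref(const float * row, int ncols, int k) {
    std::vector<int> idx(ncols);
    std::iota(idx.begin(), idx.end(), 0);  // candidate indices 0..ncols-1
    std::stable_sort(idx.begin(), idx.end(),
                     [row](int a, int b) { return row[a] > row[b]; });  // GGML_SORT_ORDER_DESC
    idx.resize(k);  // the strided device copy keeps exactly this prefix per row
    return idx;
}

Note also the two-call CUB idiom in top_k_cub() above: the first DeviceTopK::MaxPairs call, with a null temp-storage pointer, only reports temp_storage_bytes; the second call with the pool-allocated buffer does the actual work.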


@@ -1,3 +0,0 @@
#include "common.cuh"
void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -45,11 +45,9 @@
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
-#define cudaDevAttrCooperativeLaunch hipDeviceAttributeCooperativeLaunch
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
@@ -72,7 +70,6 @@
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchCooperativeKernel hipLaunchCooperativeKernel
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)


@@ -61,7 +61,6 @@
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
-#define cudaLaunchCooperativeKernel musaLaunchCooperativeKernel
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost


@@ -85,16 +85,13 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
                                       struct htp_spad * dst_spad,
                                       uint32_t nth,
                                       uint32_t ith,
-                                      uint32_t src0_nrows_per_thread,
-                                      dma_queue * dma_queue) {
+                                      uint32_t src0_nrows_per_thread) {
    htp_act_preamble3;

    size_t src0_row_size = nb01;
    size_t src1_row_size = nb11;
    size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
@@ -108,6 +105,12 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

+    int is_aligned = 1;
+    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
+        is_aligned = 0;
+        FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
+    }
+
    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;
@@ -124,81 +127,37 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
        data_src1 += swapped ? 0 : nc_in_bytes;
    }

-    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
-    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
-    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
-
-    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
-    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
-    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);
-
-    // Given src0_spad->size_per_thread, divide it into two ping-pong buffers for src0
-    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
-    size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
-    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;
-
-    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // how many rows we can process in one block
-    if (BLOCK == 0) {
-        FARF(ERROR,
-             "swiglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
-             src0_spad->size_per_thread, src0_row_size_aligned);
-        return;
-    }
-
-    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
-    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-        // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
-        dma_queue_push_vtcm_to_ddr(dma_queue,
-                                   dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
-                                   dst_row_size, dst_row_size_aligned, 0);
-        dma_queue_push_ddr_to_vtcm(dma_queue,
-                                   dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
-                                   src0_row_size_aligned, src0_row_size, block_size);
-        dma_queue_push_ddr_to_vtcm(dma_queue,
-                                   dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
-                                   src1_row_size_aligned, src1_row_size, block_size);
-    }
-
-    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-
-        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
-        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
-        float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
-
-        for (uint32_t ib = 0; ib < block_size; ib++) {
-            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
-            const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
-            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
-
-            // swiglu(x) = x1 * sigmoid(x0)
-            hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
-            hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
-                                (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
-        }
-
-        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
-                                   dst_row_size_aligned, block_size);
-
-        // prefetch the N+2 loop iteration, if any
-        const uint32_t pref_block = (ir + BLOCK * 2);
-        if (pref_block < src0_end_row) {
-            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
-            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
-                                       src0_row_size_aligned, src0_row_size, pref_block_size);
-            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
-                                       src1_row_size_aligned, src1_row_size, pref_block_size);
-        }
-    }
-    dma_queue_flush(dma_queue);
+    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
+    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
+    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
+
+    const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
+    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
+        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
+        const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
+        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));
+
+        if (ir + 1 < src0_end_row) {
+            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
+        }
+
+        // swiglu(x) = x1 * sigmoid(x0)
+        if (opt_path) {
+            hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
+            hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
+                                (uint8_t *) dst, nc);
+        } else {
+            hvx_exp_f32((const uint8_t *) src0, src0_spad_data, nc, true);
+            hvx_add_scalar_f32(src0_spad_data, 1.0, src1_spad_data, nc);
+            hvx_inverse_f32(src1_spad_data, src0_spad_data, nc);
+            hvx_mul_f32((const uint8_t *) src0, src0_spad_data, dst_spad_data, nc);
+            hvx_mul_f32(dst_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc);
+        }
+    }

    t2 = HAP_perf_get_qtimer_count();

-    FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+    FARF(HIGH, "swiglu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
         ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
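Editor's note: per element, the fused HVX sequence above computes swiglu(x) = x1 * silu(x0) = x1 * x0 * sigmoid(x0). A scalar C++ sketch of one row (hypothetical helper; it assumes x0 and x1 already point at the correct halves when the gate is packed into src0 and selected by the swapped op param):

#include <cmath>

static void swiglu_row_ref(const float * x0, const float * x1, float * dst, int nc) {
    for (int i = 0; i < nc; i++) {
        const float sig = 1.0f / (1.0f + std::exp(-x0[i]));  // sigmoid(x0)
        dst[i] = x0[i] * sig * x1[i];                        // x1 * silu(x0)
    }
}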
@@ -212,16 +171,15 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
                                           struct htp_spad * dst_spad,
                                           uint32_t nth,
                                           uint32_t ith,
-                                          uint32_t src0_nrows_per_thread,
-                                          dma_queue * dma_queue) {
+                                          uint32_t src0_nrows_per_thread) {
    htp_act_preamble3;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

-    size_t src0_row_size = nb01;
-    size_t src1_row_size = nb11;
-    size_t dst_row_size  = nb1;
+    const size_t src0_row_size = nb01;
+    const size_t src1_row_size = nb11;
+    const size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
@@ -233,110 +191,66 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
        return;
    }

+    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
+        FARF(HIGH, "act-f32: unaligned addresses in activations op, possibly slower execution\n");
+    }
+
    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;

-    const bool src1_valid = src1->ne[0];
-    const int  nc         = (src1_valid) ? ne00 : ne00 / 2;
+    bool src1_valid = src1->ne[0];
    if (!src1_valid) {
-        const int32_t swapped = op_params[1];
        data_src1 = data_src0;
-        src1_row_size = src0_row_size;
-
-        const size_t nc_in_bytes = nc * SIZEOF_FP32;
-        data_src0 += swapped ? nc_in_bytes : 0;
-        data_src1 += swapped ? 0 : nc_in_bytes;
    }

-    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
-    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
-    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
-
-    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
-    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
-    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);
-
-    // Given src0_spad->size_per_thread, divide it into two ping-pong buffers for src0
-    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
-    size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
-    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;
-
-    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // how many rows we can process in one block
-    if (BLOCK == 0) {
-        FARF(ERROR,
-             "swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least "
-             "%zu\n",
-             src0_spad->size_per_thread, src0_row_size_aligned);
-        return;
-    }
-
-    const float alpha = ((const float *) (op_params))[2];
-    const float limit = ((const float *) (op_params))[3];
-
-    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
-    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-        // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
-        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
-                                   dst_row_size, dst_row_size_aligned, 0);
-        dma_queue_push_ddr_to_vtcm(
-            dma_queue,
-            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
-            src0_row_size_aligned, src0_row_size, block_size);
-        dma_queue_push_ddr_to_vtcm(
-            dma_queue,
-            dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
-            src1_row_size_aligned, src1_row_size, block_size);
-    }
-
-    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-
-        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
-        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
-        float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
-
-        for (uint32_t ib = 0; ib < block_size; ib++) {
-            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
-            const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
-            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
-
-            // x (src0_spad_data) = std::min(src0_p[k], limit);
-            hvx_min_scalar_f32((const uint8_t *) src0_spad_ptr, limit, (uint8_t *) src0_spad_ptr, nc);
-            // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
-            hvx_clamp_scalar_f32((const uint8_t *) src1_spad_ptr, -limit, limit, (uint8_t *) src1_spad_ptr, nc);
-            // y (src1_spad_data) = y1 + 1.f
-            hvx_add_scalar_f32((const uint8_t *) src1_spad_ptr, 1.0, (uint8_t *) src1_spad_ptr, nc);
-            // x1 (dst_spad_data) = alpha * (x)
-            hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, alpha, (uint8_t *) dst_spad_ptr, nc);
-            // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1))
-            hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
-            // out = x * sigmoid(alpha * x) * (y + 1.f)
-            hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
-                                (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
-        }
-
-        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
-                                   dst_row_size_aligned, block_size);
-
-        // prefetch the N+2 loop iteration, if any
-        const uint32_t pref_block = (ir + BLOCK * 2);
-        if (pref_block < src0_end_row) {
-            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
-            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
-                                       src0_row_size_aligned, src0_row_size, pref_block_size);
-            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
-                                       src1_row_size_aligned, src1_row_size, pref_block_size);
-        }
-    }
-    dma_queue_flush(dma_queue);
+    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
+    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
+    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
+
+    const int32_t swapped = op_params[1];
+    const float   alpha   = ((const float *) (op_params))[2];
+    const float   limit   = ((const float *) (op_params))[3];
+
+    const int nc = (src1_valid) ? ne00 : ne00 / 2;
+
+    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
+        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
+        const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
+        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));
+
+        if (ir + 1 < src0_end_row) {
+            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
+        }
+
+        if (!src1) {
+            src0 += swapped ? nc : 0;
+            src1 += swapped ? 0 : nc;
+        }
+
+        // x (src0_spad_data) = std::min(src0_p[k], limit);
+        hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
+        // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
+        hvx_clamp_scalar_f32((const uint8_t *) src1, -limit, limit, src1_spad_data, nc);
+        // y (src1_spad_data) = y1 + 1.f
+        hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
+        // x1 (dst_spad_data) = alpha * (x)
+        hvx_mul_scalar_f32(src0_spad_data, alpha, dst_spad_data, nc);
+        // x2 (dst_spad_data) = expf(-x1)
+        hvx_exp_f32(dst_spad_data, dst_spad_data, nc, true);
+        // x3 (dst_spad_data) = x2 + 1.f
+        hvx_add_scalar_f32(dst_spad_data, 1.0, dst_spad_data, nc);
+        // x4 (dst_spad_data) = 1 / x3
+        hvx_inverse_f32(dst_spad_data, dst_spad_data, nc);
+        // out_glu(dst_spad_data) = x * x4
+        hvx_mul_f32(src0_spad_data, dst_spad_data, dst_spad_data, nc);
+        // out = out_glu * (y + 1.f);
+        hvx_mul_f32(dst_spad_data, src1_spad_data, (uint8_t *) dst, nc);
+    }

    t2 = HAP_perf_get_qtimer_count();

-    FARF(HIGH, "swiglu-oai-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
+    FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
         src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2],
         src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
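Editor's note: the step comments above translate to this per-element reference for the swiglu-oai variant: clamp the gate from above by limit, clamp the linear part to [-limit, limit], then out = x * sigmoid(alpha * x) * (y + 1). A scalar C++ sketch (hypothetical helper; C++17 for std::clamp):

#include <algorithm>
#include <cmath>

static void swiglu_oai_row_ref(const float * x_in, const float * y_in, float * dst,
                               int nc, float alpha, float limit) {
    for (int i = 0; i < nc; i++) {
        const float x   = std::min(x_in[i], limit);              // x = min(x, limit)
        const float y   = std::clamp(y_in[i], -limit, limit);    // y = clamp(y, -limit, limit)
        const float sig = 1.0f / (1.0f + std::exp(-alpha * x));  // sigmoid(alpha * x)
        dst[i] = x * sig * (y + 1.0f);                           // out = x * sig * (y + 1)
    }
}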
@@ -457,8 +371,7 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
                                       struct htp_spad * dst_spad,
                                       uint32_t nth,
                                       uint32_t ith,
-                                      uint32_t src0_nrows_per_thread,
-                                      dma_queue * dma_queue) {
+                                      uint32_t src0_nrows_per_thread) {
    htp_act_preamble2;

    uint64_t t1, t2;
@@ -466,8 +379,6 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;

-    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
-    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
-
    const uint32_t src0_nrows = ne01 * ne02 * ne03;
@@ -479,91 +390,64 @@ static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
        return;
    }

-    const uint8_t * data_src0 = (const uint8_t *) src0->data;
-    uint8_t *       data_dst  = (uint8_t *) dst->data;
-
-    uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
-    uint8_t * dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);
-
-    // Given src0_spad->size_per_thread, divide it into two ping-pong buffers for src0
-    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
-    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;
-
-    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // how many rows we can process in one block
-    if (BLOCK == 0) {
-        FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
-             src0_spad->size_per_thread, src0_row_size_aligned);
-        return;
-    }
-
-    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
-    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-        // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
-        dma_queue_push_vtcm_to_ddr(dma_queue,
-                                   dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
-                                   dst_row_size, dst_row_size_aligned, 0);
-        dma_queue_push_ddr_to_vtcm(dma_queue,
-                                   dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
-                                   src0_row_size_aligned, src0_row_size, block_size);
-    }
-
-    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
-
-        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
-        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
-
-        for (uint32_t ib = 0; ib < block_size; ib++) {
-            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
-            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
-
-            // silu = x * sigmoid(x)
-            hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
-            hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
-        }
-
-        dma_queue_push_vtcm_to_ddr(dma_queue,
-                                   dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
-                                   dst_row_size, dst_row_size_aligned, block_size);
-
-        // prefetch the N+2 loop iteration, if any
-        const uint32_t pref_block = (ir + BLOCK * 2);
-        if (pref_block < src0_end_row) {
-            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
-            dma_queue_push_ddr_to_vtcm(dma_queue,
-                                       dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
-                                       src0_row_size_aligned, src0_row_size, pref_block_size);
-        }
-    }
-    dma_queue_flush(dma_queue);
+    int is_aligned = 1;
+    int opt_path   = 0;
+    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
+        is_aligned = 0;
+        FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
+    }
+    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
+        opt_path = 1;
+    }
+
+    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
+    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;
+
+    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
+    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
+
+    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
+        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
+        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));
+
+        if (ir + 1 < src0_end_row) {
+            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
+        }
+
+        // silu = x * sigmoid(x)
+        if (1 == opt_path) {
+            hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
+        } else {
+            hvx_exp_f32((const uint8_t *) src0, src0_spad_data, ne0, true);
+            hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0);
+            hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0);
+            hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
+        }
+    }

    t2 = HAP_perf_get_qtimer_count();

-    FARF(HIGH, "silu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
+    FARF(HIGH, "silu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
-                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
+                               octx->src0_nrows_per_thread);
}

static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
-                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
+                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}

static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
-                                   &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
+                                   &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}

static int execute_op_activations_fp32(struct htp_ops_context * octx) {
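Editor's note: all three kernels in this file gate their fast path on the same test: the base addresses must be VLEN-aligned and the row stride nb01 must be a multiple of VLEN, checked with a bitmask because VLEN is a power of two. A standalone sketch of that predicate (hypothetical helper; VLEN is assumed to be 128 bytes, the HVX vector width):

#include <cstddef>
#include <cstdint>

constexpr std::uintptr_t VLEN = 128;  // assumed HVX vector length in bytes (power of two)

// True when the pointer and the row stride are both multiples of VLEN:
// for a power-of-two VLEN, "x & (VLEN - 1)" is x modulo VLEN.
static bool rows_are_vector_aligned(const void * p, std::size_t nb01) {
    return ((reinterpret_cast<std::uintptr_t>(p) | nb01) & (VLEN - 1)) == 0;
}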


@@ -2181,11 +2181,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
    const bool has_mask = op->src[3] != nullptr;

-    // note: the non-vec kernel requires more extra memory, so always reserve for it
-    GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
-    //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
-    if (false) {
+    if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
        // note: always reserve the padding space to avoid graph reallocations
        //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
        const bool has_kvpad = true;


@@ -1517,12 +1517,10 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
    struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
    graph->n_nodes = n_nodes;
    std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
-    tensor_ptrs.reserve(n_tensors);
    for (uint32_t i = 0; i < n_tensors; i++) {
-        tensor_ptrs.emplace(tensors[i].id, &tensors[i]);
+        tensor_ptrs[tensors[i].id] = &tensors[i];
    }
    std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
-    tensor_map.reserve(n_nodes);
    for (uint32_t i = 0; i < n_nodes; i++) {
        int64_t id;
        memcpy(&id, &nodes[i], sizeof(id));
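Editor's note: the removed lines above are a small-allocation optimization: reserve() sizes the hash table once so the loop never rehashes, and emplace() constructs the entry in place instead of default-constructing through operator[] and then assigning. A minimal C++ sketch of the pattern (hypothetical build_index helper):

#include <cstdint>
#include <unordered_map>

static void build_index(const uint64_t * ids, const void * const * ptrs, uint32_t n,
                        std::unordered_map<uint64_t, const void *> & map) {
    map.reserve(n);  // pre-size buckets: no rehashing during the inserts below
    for (uint32_t i = 0; i < n; i++) {
        map.emplace(ids[i], ptrs[i]);  // construct in place; keeps the first id on duplicates
    }
}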


@@ -434,15 +434,8 @@ static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGM
                                                                             GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                             GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
                                                                             GGML_OP_RESHAPE };
-static constexpr std::initializer_list<ggml_op> topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD,
-                                                                            GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
-                                                                            GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
-                                                                            GGML_OP_DIV, GGML_OP_RESHAPE };
static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
                                                                         GGML_OP_VIEW, GGML_OP_GET_ROWS };
static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax  { GGML_OP_ARGSORT, GGML_OP_VIEW,
                                                                         GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                         GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
@@ -471,32 +464,6 @@ static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softma
    { 9, 0, 8 },  // reshape->src[0] == div
};

-//node #436 ( UNARY): ffn_moe_probs-10 ( 256K) [Vulka ] use=2: ffn_moe_logits-10 ( 256K) [Vulka ]
-//node #437 ( RESHAPE): ffn_moe_probs-10 (re ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ]
-//node #438 ( ADD): ffn_moe_probs_biased ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] blk.10.exp_probs_b.b ( 0K) [Vulka ]
-//node #439 ( ARGSORT): ffn_moe_argsort-10 ( 256K) [Vulka ] use=1: ffn_moe_probs_biased ( 256K) [Vulka ]
-//node #440 ( VIEW): ffn_moe_topk-10 ( 255K) [Vulka ] use=3: ffn_moe_argsort-10 ( 256K) [Vulka ]
-//node #441 ( GET_ROWS): ffn_moe_weights-10 ( 12K) [Vulka ] use=1: ffn_moe_probs-10 (re ( 256K) [Vulka ] ffn_moe_topk-10 ( 255K) [Vulka ]
-//node #442 ( RESHAPE): ffn_moe_weights-10 ( ( 12K) [Vulka ] use=2: ffn_moe_weights-10 ( 12K) [Vulka ]
-//node #443 ( SUM_ROWS): ffn_moe_weights_sum- ( 2K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ]
-//node #444 ( CLAMP): ffn_moe_weights_sum_ ( 2K) [Vulka ] use=1: ffn_moe_weights_sum- ( 2K) [Vulka ]
-//node #445 ( DIV): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] ffn_moe_weights_sum_ ( 2K) [Vulka ]
-//node #446 ( RESHAPE): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights_norm ( 12K) [Vulka ]
-static constexpr std::initializer_list<std::array<int, 3>> topk_moe_sigmoid_norm_bias_edges {
-    { 1, 0, 0 },  // reshape->src[0] == sigmoid
-    { 2, 0, 0 },  // add->src[0] == sigmoid
-    { 3, 0, 2 },  // argsort->src[0] == add
-    { 4, 0, 3 },  // view->src[0] == argsort
-    { 5, 0, 1 },  // get_rows->src[0] == reshape
-    { 5, 1, 4 },  // get_rows->src[1] == view
-    { 6, 0, 5 },  // reshape->src[0] == get_rows
-    { 7, 0, 6 },  // sum_rows->src[0] == reshape
-    { 8, 0, 7 },  // clamp->src[0] == sum_rows
-    { 9, 0, 6 },  // div->src[0] == reshape
-    { 9, 1, 8 },  // div->src[1] == clamp
-    {10, 0, 9 },  // reshape->src[0] == div
-};
-
// same as early_softmax_norm but ending after the get_rows
static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_edges {
    { 1, 0, 0 },  // reshape->src[0] == softmax
@@ -524,10 +491,16 @@ enum topk_moe_mode {
    TOPK_MOE_EARLY_SOFTMAX,
    TOPK_MOE_EARLY_SOFTMAX_NORM,
    TOPK_MOE_LATE_SOFTMAX,
-    TOPK_MOE_SIGMOID_NORM_BIAS,
    TOPK_MOE_COUNT,
};

+static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
+    topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
+                         num == topk_moe_early_softmax.size() - 1      ? TOPK_MOE_EARLY_SOFTMAX :
+                                                                         TOPK_MOE_LATE_SOFTMAX;
+    return mode;
+}
+
static constexpr std::initializer_list<std::array<int, 3>> rope_view_set_rows_edges {
    { 1, 0, 0 },  // view->src[0] == rope
    { 2, 0, 1 },  // set_rows->src[0] == view
@@ -765,9 +738,6 @@ struct vk_device_struct {
    vk_pipeline pipeline_topk_f32[num_topk_pipelines];
    vk_pipeline pipeline_sum_rows_f32;
    vk_pipeline pipeline_cumsum_f32;
-    vk_pipeline pipeline_cumsum_small_f32;
-    vk_pipeline pipeline_cumsum_multipass1_f32;
-    vk_pipeline pipeline_cumsum_multipass2_f32;
    vk_pipeline pipeline_argmax_f32;
    vk_pipeline pipeline_count_equal_i32;
    std::map<vk_solve_tri_pipeline_state, vk_pipeline> pipeline_solve_tri_f32;
@@ -796,7 +766,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_count_experts;

    // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
-    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
+    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];

    std::vector<vk_pipeline_ref> all_pipelines;
@@ -1211,11 +1181,6 @@ struct vk_op_topk_moe_push_constants {
    uint32_t n_expert_used;
    float clamp_min;
    float clamp_max;
-    uint32_t gating_func;
-    uint32_t has_bias;
-    uint32_t with_norm;
-    float output_scale;
-    float output_bias;
};

struct vk_op_add_id_push_constants {
@@ -1806,8 +1771,6 @@ struct ggml_backend_vk_context {
    // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
    // If there's no fusion, bit 0 is still set.
    int fused_ops_write_mask {};
-    topk_moe_mode fused_topk_moe_mode {};
-    bool fused_topk_moe_scale {};

    // for GGML_VK_PERF_LOGGER
    std::unique_ptr<vk_perf_logger> perf_logger;
@@ -2705,7 +2668,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
    switch (src0_type) {
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ1_M:
-            lut_size = 2*2048 + 4*2048;
+            lut_size = 2*2048;
            break;
        case GGML_TYPE_IQ2_XXS:
            lut_size = 8*256;
@@ -3630,7 +3593,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    uint32_t rm_kq = 2;
    uint32_t rm_stdq_int = 1;
    uint32_t rm_kq_int = 1;
-    auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; };
    if (device->vendor_id == VK_VENDOR_ID_AMD) {
        if (device->architecture == AMD_GCN) {
            rm_stdq = 2;
@@ -3734,10 +3696,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);
-            ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);
        }
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
    }
@@ -3784,9 +3742,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
-        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
    }
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
}
@@ -3794,7 +3749,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
#if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    GGML_UNUSED(rm_stdq_int);
    GGML_UNUSED(rm_kq_int);
-    GGML_UNUSED(rm_iq_int);
#endif

    // dequant shaders
@@ -4181,11 +4135,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

-    const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size }, 1, true, true, device->subgroup_size);

    ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
@@ -4341,7 +4291,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
    for (uint32_t use_push = 0; use_push < 2; ++use_push) {
        for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
        }
    }
@@ -5632,8 +5584,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
            break;
        default:
            return nullptr;
@@ -5790,8 +5740,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
            break;
        default:
            return nullptr;
@@ -7057,7 +7005,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
    // Quantization overhead is not worth it for small k
    switch (device->vendor_id) {
        case VK_VENDOR_ID_NVIDIA:
-            if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) {
+            if (src0_type == GGML_TYPE_Q2_K) {
                return true;
            }
@@ -8736,9 +8684,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            if (ctx->num_additional_fused_ops) {
                uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
                GGML_ASSERT(idx < num_topk_moe_pipelines);
+                topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
                // use n_experts from push constant if it's not equal to the power of two spec constant
                bool use_push = dst->ne[0] != (1u << idx);
-                return ctx->device->pipeline_topk_moe[idx][use_push];
+                return ctx->device->pipeline_topk_moe[idx][mode][use_push];
            }

            if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -8811,11 +8760,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return nullptr;
        case GGML_OP_CUMSUM:
            if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                if (src0->ne[0] <= 512) {
-                    return ctx->device->pipeline_cumsum_small_f32;
-                } else {
-                    return ctx->device->pipeline_cumsum_f32;
-                }
+                return ctx->device->pipeline_cumsum_f32;
            }
            return nullptr;
        case GGML_OP_SOLVE_TRI:
@@ -10401,16 +10346,14 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
}

static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
-    topk_moe_mode mode = ctx->fused_topk_moe_mode;
+    topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);

    ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
-    ggml_tensor * bias    = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits;
-    ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
-    ggml_tensor * ids     = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] :
-                            (mode == TOPK_MOE_LATE_SOFTMAX)      ? cgraph->nodes[node_idx + 1] :
-                                                                   cgraph->nodes[node_idx + 3];
+    ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
+                            (mode == TOPK_MOE_EARLY_SOFTMAX)      ? cgraph->nodes[node_idx + 4] :
+                                                                    cgraph->nodes[node_idx + 5];
+    ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];

    GGML_ASSERT(logits->type == GGML_TYPE_F32);
-    GGML_ASSERT(bias->type == GGML_TYPE_F32);
    GGML_ASSERT(weights->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -10425,7 +10368,6 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);

    vk_subbuffer logits_buf  = ggml_vk_tensor_subbuffer(ctx, logits);
-    vk_subbuffer bias_buf    = ggml_vk_tensor_subbuffer(ctx, bias);
    vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights);
    vk_subbuffer ids_buf     = ggml_vk_tensor_subbuffer(ctx, ids);
@@ -10433,45 +10375,18 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
    pc.n_rows         = n_rows;
    pc.n_experts_push = n_experts;
    pc.n_expert_used  = n_expert_used;
-    pc.clamp_min      = -std::numeric_limits<float>::infinity();
-    pc.clamp_max      =  std::numeric_limits<float>::infinity();
    if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
        ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
-        GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
        pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
        pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
    }
-    if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) {
-        ggml_tensor * clamp = cgraph->nodes[node_idx + 8];
-        GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
-        pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
-        pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
-    }
-
-#define GATING_FUNC_SOFTMAX        0
-#define GATING_FUNC_SIGMOID        1
-#define GATING_FUNC_SOFTMAX_WEIGHT 2
-
-    pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID :
-                     mode == TOPK_MOE_LATE_SOFTMAX      ? GATING_FUNC_SOFTMAX_WEIGHT :
-                                                          GATING_FUNC_SOFTMAX;
-    pc.has_bias  = mode == TOPK_MOE_SIGMOID_NORM_BIAS;
-    pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS;
-
-    if (ctx->fused_topk_moe_scale) {
-        GGML_ASSERT(weights->op == GGML_OP_SCALE);
-        pc.output_scale = ggml_get_op_params_f32(weights, 0);
-        pc.output_bias  = ggml_get_op_params_f32(weights, 1);
-    } else {
-        pc.output_scale = 1.0f;
-        pc.output_bias  = 0.0f;
-    }

    GGML_ASSERT(n_expert_used <= n_experts);

    const uint32_t rows_per_block = 4;
    std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements);
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, weights_buf, ids_buf}, pc, elements);
}

static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
@@ -10719,50 +10634,8 @@ static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, cons
}

static void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
-
-    // Use the single pass shader when the rows are small or there are enough rows to fill the GPU.
-    // For fewer, larger rows, use the multipass shader to spread each row across SMs.
-    if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) {
-        ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc);
-        return;
-    }
-
-    // First pass computes partial sums within a block, and stores the last partial
-    // to the temp buffer. Second pass sums the block partials from the temp buffer
-    // and adds that to the result of the first pass.
-    vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32;
-    vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32;
-    GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr);
-
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
-
-    std::array<uint32_t, 3> elements;
-    elements[0] = dst->ne[0];
-    elements[1] = (uint32_t)ggml_nrows(dst);
-    elements[2] = 1;
-
-    size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst);
-    if (ctx->prealloc_size_split_k < temp_size) {
-        ctx->prealloc_size_split_k = temp_size;
-        ggml_vk_preallocate_buffers(ctx, subctx);
-    }
-
-    vk_subbuffer src_buf  = ggml_vk_tensor_subbuffer(ctx, src0);
-    vk_subbuffer dst_buf  = ggml_vk_tensor_subbuffer(ctx, dst);
-    vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
-
-    if (ctx->prealloc_split_k_need_sync) {
-        ggml_vk_sync_buffers(ctx, subctx);
-    }
-
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements);
-    ggml_vk_sync_buffers(ctx, subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements);
-    ctx->prealloc_split_k_need_sync = true;
+    vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, p);
}

static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
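Editor's note: the removed multipass cumsum is a classic two-pass scan: pass 1 produces per-block inclusive prefix sums plus one total per block, pass 2 adds the running sum of the preceding block totals to every element. A sequential C++ model of the idea (BLOCK and the helper name are illustrative, not the shader's actual dispatch shape):

#include <algorithm>
#include <cstddef>
#include <vector>

static void cumsum_two_pass(std::vector<float> & v) {
    const std::size_t BLOCK = 256;
    std::vector<float> block_totals((v.size() + BLOCK - 1) / BLOCK, 0.0f);
    // pass 1: inclusive scan within each block, recording the block total
    for (std::size_t b = 0; b * BLOCK < v.size(); b++) {
        float acc = 0.0f;
        const std::size_t end = std::min(v.size(), (b + 1) * BLOCK);
        for (std::size_t i = b * BLOCK; i < end; i++) {
            acc += v[i];
            v[i] = acc;
        }
        block_totals[b] = acc;
    }
    // pass 2: add the exclusive scan of block totals to each block's elements
    float carry = 0.0f;
    for (std::size_t b = 0; b * BLOCK < v.size(); b++) {
        const std::size_t end = std::min(v.size(), (b + 1) * BLOCK);
        for (std::size_t i = b * BLOCK; i < end; i++) {
            v[i] += carry;
        }
        carry += block_totals[b];
    }
}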
@@ -12255,11 +12128,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
            break;
        case GGML_OP_UNARY:
-            if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
-                ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
-                break;
-            }
            switch (ggml_get_unary_op(node)) {
                case GGML_UNARY_OP_EXP:
                case GGML_UNARY_OP_SILU:
@@ -12307,7 +12175,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
            break;
        case GGML_OP_SOFT_MAX:
-            if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+            if (ctx->num_additional_fused_ops) {
                ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
            } else {
                ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node);
@@ -12327,7 +12195,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
            break;
        case GGML_OP_ARGSORT:
-            if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+            if (ctx->num_additional_fused_ops) {
                ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
            } else {
                ggml_vk_argsort(ctx, compute_ctx, src0, node);
@@ -13180,24 +13048,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
        get_rows = cgraph->nodes[node_idx + 4];
        argsort = cgraph->nodes[node_idx + 2];
        break;
-    case TOPK_MOE_SIGMOID_NORM_BIAS:
-        softmax = cgraph->nodes[node_idx + 0]; // really sigmoid
-        weights = cgraph->nodes[node_idx + 10];
-        get_rows = cgraph->nodes[node_idx + 5];
-        argsort = cgraph->nodes[node_idx + 3];
-        if (ggml_get_unary_op(softmax) != GGML_UNARY_OP_SIGMOID) {
-            return false;
-        }
-        // bias is expected to be 1D
-        if (ggml_nrows(cgraph->nodes[node_idx + 2]->src[1]) != 1 ||
-            !ggml_is_contiguous(cgraph->nodes[node_idx + 2]->src[1])) {
-            return false;
-        }
-        // sigmoid fusion seems to generate infinities on moltenvk
-        if (ctx->device->driver_id == vk::DriverId::eMoltenvk) {
-            return false;
-        }
-        break;
    case TOPK_MOE_EARLY_SOFTMAX:
        softmax = cgraph->nodes[node_idx + 0];
        weights = cgraph->nodes[node_idx + 4];
@@ -13221,28 +13071,26 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
    probs = probs->src[0];

    ggml_tensor * selection_probs = argsort->src[0];
-    if (probs != selection_probs && mode != TOPK_MOE_SIGMOID_NORM_BIAS) {
+    if (probs != selection_probs) {
        return false;
    }

+    const float * op_params = (const float *)softmax->op_params;
+
+    float scale = op_params[0];
+    float max_bias = op_params[1];
+
    if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
        return false;
    }

-    if (softmax->op == GGML_OP_SOFT_MAX) {
-        const float * op_params = (const float *)softmax->op_params;
-
-        float scale = op_params[0];
-        float max_bias = op_params[1];
-
-        if (scale != 1.0f || max_bias != 0.0f) {
-            return false;
-        }
-        // don't fuse when masks or sinks are present
-        if (softmax->src[1] || softmax->src[2]) {
-            return false;
-        }
+    if (scale != 1.0f || max_bias != 0.0f) {
+        return false;
+    }
+    // don't fuse when masks or sinks are present
+    if (softmax->src[1] || softmax->src[2]) {
+        return false;
    }

    const int n_expert = softmax->ne[0];
@@ -13515,8 +13363,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                total_mul_mat_bytes += bytes;
            }

-            ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
-            ctx->fused_topk_moe_scale = false;
-
            const char *fusion_string {};
            if (!ctx->device->disable_fusion) {
                uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
@@ -13562,23 +13408,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                    ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
                    // view of argsort writes to memory
                    ctx->fused_ops_write_mask |= 1 << 3;
-                    ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
                    fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
-                } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
-                           ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
-                           ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
-                    ctx->num_additional_fused_ops = topk_moe_sigmoid_norm_bias.size() - 1;
-                    // view of argsort writes to memory
-                    ctx->fused_ops_write_mask |= 1 << 4;
-                    ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
-                    fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
                } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
                           ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
                           ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
                    ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
                    // view of argsort writes to memory
                    ctx->fused_ops_write_mask |= 1 << 3;
-                    ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
                    fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
                } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
                           ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
@@ -13586,17 +13422,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                    ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
                    // view of argsort writes to memory
                    ctx->fused_ops_write_mask |= 1 << 1;
-                    ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
                    fusion_string = "TOPK_MOE_LATE_SOFTMAX";
                }
-                if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
-                    // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
-                    if (ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops - 1, { GGML_OP_DIV, GGML_OP_RESHAPE, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 }) ||
-                        ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
-                        ctx->fused_topk_moe_scale = true;
-                        ctx->num_additional_fused_ops++;
-                    }
-                }
            }

            ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13775,9 +13602,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
        if (keep_pattern(topk_moe_early_softmax_norm)) {
            continue;
        }
-        if (keep_pattern(topk_moe_sigmoid_norm_bias)) {
-            continue;
-        }
        if (keep_pattern(topk_moe_early_softmax)) {
            continue;
        }
@@ -13804,7 +13628,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
            }
            // Don't pull forward nodes from fusion patterns
            if (match_pattern(topk_moe_early_softmax_norm, j) ||
-                match_pattern(topk_moe_sigmoid_norm_bias, j) ||
                match_pattern(topk_moe_early_softmax, j) ||
                match_pattern(topk_moe_late_softmax, j)) {
                continue;

View File

@@ -14,7 +14,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 128;
layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
-layout (constant_id = 2) const uint ELEM_PER_THREAD = 4;

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
@@ -39,45 +38,32 @@ void main() {
        last_sum = 0;
    }

-    uint col = tid * ELEM_PER_THREAD;
-    uint num_iter = CEIL_DIV(p.n_cols, BLOCK_SIZE * ELEM_PER_THREAD);
+    uint col = tid;
+    uint num_iter = CEIL_DIV(p.n_cols, BLOCK_SIZE);
    for (int i = 0; i < num_iter; ++i) {
-        FLOAT_TYPE v[ELEM_PER_THREAD];
-        FLOAT_TYPE thread_sum = 0;
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            if (col + j < p.n_cols) {
-                thread_sum += FLOAT_TYPE(data_a[src_idx + col + j]);
-            }
-            v[j] = thread_sum;
+        FLOAT_TYPE v = 0;
+        if (col < p.n_cols) {
+            v = FLOAT_TYPE(data_a[src_idx + col]);
        }
-
-        thread_sum = subgroupExclusiveAdd(thread_sum);
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            v[j] += thread_sum;
-        }
+        v = subgroupInclusiveAdd(v);

        // Store the largest partial sum for each subgroup, then add the partials for all
        // lower subgroups and the final partial sum from the previous iteration.
        if (gl_SubgroupInvocationID == SUBGROUP_SIZE - 1) {
-            partial[subgroup_id] = v[ELEM_PER_THREAD - 1];
+            partial[subgroup_id] = v;
        }
        barrier();
-        for (int s = 0; s < subgroup_id; ++s) {
-            [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-                v[j] += partial[s];
-            }
-        }
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            v[j] += last_sum;
+        for (int j = 0; j < subgroup_id; ++j) {
+            v += partial[j];
        }
+        v += last_sum;
        barrier();
        if (tid == BLOCK_SIZE - 1) {
-            last_sum = v[ELEM_PER_THREAD - 1];
+            last_sum = v;
        }
-        [[unroll]] for (uint j = 0; j < ELEM_PER_THREAD; ++j) {
-            if (col + j < p.n_cols) {
-                data_d[dst_idx + col + j] = D_TYPE(v[j]);
-            }
+        if (col < p.n_cols) {
+            data_d[dst_idx + col] = D_TYPE(v);
        }
-        col += BLOCK_SIZE * ELEM_PER_THREAD;
+        col += BLOCK_SIZE;
    }
}
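
The single-pass shader composes the row scan from subgroup primitives. A CPU model of one block iteration, under the assumption that the block width is a multiple of the subgroup size (illustrative only):

    #include <vector>

    // One block iteration of the scan above: an inclusive scan inside each
    // subgroup, a shared "partial" per subgroup, then every lane adds the
    // lower subgroups' partials plus the carry from the previous iteration.
    void block_inclusive_scan(std::vector<float> & v, int subgroup_size, float & last_sum) {
        const int n = (int)v.size();
        for (int base = 0; base < n; base += subgroup_size) {    // subgroupInclusiveAdd
            for (int lane = base + 1; lane < base + subgroup_size && lane < n; ++lane) {
                v[lane] += v[lane - 1];
            }
        }
        std::vector<float> partial;                              // shared partial[]
        for (int last = subgroup_size - 1; last < n; last += subgroup_size) {
            partial.push_back(v[last]);
        }
        for (int i = 0; i < n; ++i) {
            float add = last_sum;
            for (int s = 0; s < i / subgroup_size; ++s) {
                add += partial[s];
            }
            v[i] += add;
        }
        if (n > 0) {
            last_sum = v[n - 1];                                 // carry into the next block
        }
    }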

View File

@@ -1,60 +0,0 @@
#version 450
#include "types.glsl"
#include "sum_rows.glsl"
#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (binding = 2) writeonly buffer T {D_TYPE data_t[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 128;
layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
shared FLOAT_TYPE partial[BLOCK_SIZE / SUBGROUP_SIZE];
void main() {
const uint row = gl_WorkGroupID.y;
const uint tid = gl_LocalInvocationID.x;
const uint col = gl_GlobalInvocationID.x;
const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
const uint i03_offset = i03 * p.ne01*p.ne02;
const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
const uint i01 = row - i03_offset - i02*p.ne01;
const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
uint subgroup_id = tid / SUBGROUP_SIZE;
FLOAT_TYPE v = 0;
if (col < p.n_cols) {
v = FLOAT_TYPE(data_a[src_idx + col]);
}
v = subgroupInclusiveAdd(v);
// Store the largest partial sum for each subgroup, then add the partials for all
// lower subgroups and the final partial sum from the previous iteration.
if (gl_SubgroupInvocationID == SUBGROUP_SIZE - 1) {
partial[subgroup_id] = v;
}
barrier();
for (int j = 0; j < subgroup_id; ++j) {
v += partial[j];
}
barrier();
if (tid == BLOCK_SIZE - 1) {
data_t[gl_WorkGroupID.x + gl_NumWorkGroups.x * row] = v;
}
if (col < p.n_cols) {
data_d[dst_idx + col] = D_TYPE(v);
}
}

View File

@@ -1,66 +0,0 @@
#version 450
#include "types.glsl"
#include "sum_rows.glsl"
#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) buffer D {D_TYPE data_d[];};
layout (binding = 2) readonly buffer T {D_TYPE data_t[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 128;
layout (constant_id = 1) const uint SUBGROUP_SIZE = 32;
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
shared FLOAT_TYPE temp[BLOCK_SIZE / SUBGROUP_SIZE];
void main() {
const uint row = gl_WorkGroupID.y;
const uint tid = gl_LocalInvocationID.x;
const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
const uint i03_offset = i03 * p.ne01*p.ne02;
const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
const uint i01 = row - i03_offset - i02*p.ne01;
const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
const uint col = gl_GlobalInvocationID.x;
float v = 0;
// prefetch value we're adding to
if (col < p.n_cols) {
v = data_d[dst_idx + col];
}
// compute the sum of all previous blocks
uint c = tid;
float sum = 0;
while (c < gl_WorkGroupID.x) {
sum += data_t[c + gl_NumWorkGroups.x * row];
c += BLOCK_SIZE;
}
sum = subgroupAdd(sum);
if (gl_SubgroupInvocationID == 0) {
temp[gl_SubgroupID] = sum;
}
barrier();
sum = 0;
[[unroll]] for (uint s = 0; s < BLOCK_SIZE / SUBGROUP_SIZE; ++s) {
sum += temp[s];
}
// Add the sum to what the first pass computed
if (col < p.n_cols) {
data_d[dst_idx + col] = v + sum;
}
}
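
Pass 2 above reconstructs the carry for a block with a strided loop over the block totals followed by a reduction. A CPU model of that step (the plain loop below stands in for the subgroupAdd plus shared-memory reduction):

    #include <vector>

    // Carry for block block_id: the sum of all earlier blocks' totals, gathered
    // by BLOCK_SIZE "threads" striding over the temp buffer, then reduced.
    float carry_for_block(const std::vector<float> & block_total, int block_id, int block_size) {
        std::vector<float> thread_sum(block_size, 0.0f);
        for (int t = 0; t < block_size; ++t) {
            for (int c = t; c < block_id; c += block_size) { // strided loop, as in the shader
                thread_sum[t] += block_total[c];
            }
        }
        float carry = 0.0f; // reduction of the per-thread sums
        for (int t = 0; t < block_size; ++t) {
            carry += thread_sum[t];
        }
        return carry;
    }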

View File

@@ -14,8 +14,6 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
#define K_PER_ITER 8
#elif defined(DATA_A_QUANT_K)
#define K_PER_ITER 16
-#elif defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M)
-#define K_PER_ITER 32
#else
#error unimplemented
#endif
@@ -51,15 +49,6 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
        cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 1];
        cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 2];
        cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 3];
-#elif K_PER_ITER == 32
-        cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8    ];
-        cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 1];
-        cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 2];
-        cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 3];
-        cache_b_qs[4] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 4];
-        cache_b_qs[5] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 5];
-        cache_b_qs[6] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 6];
-        cache_b_qs[7] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 7];
#else
#error unimplemented
#endif

View File

@@ -377,118 +377,3 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
    return FLOAT_TYPE(float(cache_b_ds.x) * float(d_scale) * float(q_sum));
}
#endif
#if defined(DATA_A_IQ1_S)
void repack8(uint ib, uint iqs, out i32vec4 out0, out i32vec4 out1) {
const uint ib32 = iqs / 32;
const uint qh = data_a[ib].qh[ib32];
const uint qs16_0 = data_a_packed16[ib].qs[(4 * ib32 + 0) / 2];
const uint qs16_1 = data_a_packed16[ib].qs[(4 * ib32 + 2) / 2];
const uint qs0 = qs16_0 & 0xFF;
const uint qs1 = qs16_0 >> 8;
const uint qs2 = qs16_1 & 0xFF;
const uint qs3 = qs16_1 >> 8;
const uint hi0 = bitfieldExtract(qh, 3 * int(0), 3);
const uint hi1 = bitfieldExtract(qh, 3 * int(1), 3);
const uint hi2 = bitfieldExtract(qh, 3 * int(2), 3);
const uint hi3 = bitfieldExtract(qh, 3 * int(3), 3);
const int32_t grid0 = int32_t(iq1s_grid_gpu[qs0 | (hi0 << 8)]);
const int32_t grid1 = int32_t(iq1s_grid_gpu[qs1 | (hi1 << 8)]);
const int32_t grid2 = int32_t(iq1s_grid_gpu[qs2 | (hi2 << 8)]);
const int32_t grid3 = int32_t(iq1s_grid_gpu[qs3 | (hi3 << 8)]);
out0 = i32vec4((grid0 >> 0) & 0x0F0F0F0F,
(grid0 >> 4) & 0x0F0F0F0F,
(grid1 >> 0) & 0x0F0F0F0F,
(grid1 >> 4) & 0x0F0F0F0F);
out1 = i32vec4((grid2 >> 0) & 0x0F0F0F0F,
(grid2 >> 4) & 0x0F0F0F0F,
(grid3 >> 0) & 0x0F0F0F0F,
(grid3 >> 4) & 0x0F0F0F0F);
}
vec2 get_dm(uint ib, uint iqs) {
const uint ib32 = iqs / 32;
const uint qh = data_a[ib].qh[ib32];
const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
const float d = float(data_a[ib].d);
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
// the -1 cancels out the bias in iq1s_grid_gpu
return FLOAT_TYPE_VEC2(dl, dl * (delta - 1));
}
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
int32_t q_sum = 0;
const uint ib_k = ib_a / 8;
const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;
i32vec4 qs_a0;
i32vec4 qs_a1;
repack8(ib_k, iqs_k, qs_a0, qs_a1);
const vec2 dm = get_dm(ib_k, iqs_k);
q_sum += dotPacked4x8EXT(qs_a0.x, cache_b_qs[0]);
q_sum += dotPacked4x8EXT(qs_a0.y, cache_b_qs[1]);
q_sum += dotPacked4x8EXT(qs_a0.z, cache_b_qs[2]);
q_sum += dotPacked4x8EXT(qs_a0.w, cache_b_qs[3]);
q_sum += dotPacked4x8EXT(qs_a1.x, cache_b_qs[4]);
q_sum += dotPacked4x8EXT(qs_a1.y, cache_b_qs[5]);
q_sum += dotPacked4x8EXT(qs_a1.z, cache_b_qs[6]);
q_sum += dotPacked4x8EXT(qs_a1.w, cache_b_qs[7]);
return FLOAT_TYPE(float(cache_b_ds.x) * float(dm.x) * float(q_sum) + float(dm.y) * float(cache_b_ds.y));
}
#endif
#if defined(DATA_A_IQ1_M)
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
const uint ib_k = ib_a / 8;
const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;
const uint ib32 = iqs_k / 32;
const uint ib64 = ib32 / 2;
const uint16_t[4] scales = data_a[ib_k].scales;
const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
const uint qs32 = data_a_packed32[ib_k].qs[ib32];
const uint qh16 = data_a_packed16[ib_k].qh[ib32];
float sum = 0;
const uint sc = data_a[ib_k].scales[ib64];
[[unroll]] for (int l = 0; l < 4; ++l) {
const uint ib16 = 2 * ib32 + l / 2;
const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
const uint qh = qh16 >> (4 * l);
const uint qs = (qs32 >> (8 * l)) & 0xFF;
const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
const int32_t grid = int32_t(iq1s_grid_gpu[qs | ((qh & 7) << 8)]);
int32_t q_sum = 0;
q_sum += dotPacked4x8EXT((grid >> 0) & 0x0F0F0F0F, cache_b_qs[2 * l + 0]);
q_sum += dotPacked4x8EXT((grid >> 4) & 0x0F0F0F0F, cache_b_qs[2 * l + 1]);
int32_t y_sum = 0;
y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 0]);
y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 1]);
// the -1 cancels out the bias in iq1s_grid_gpu
sum += dl * (q_sum + y_sum * (delta - 1));
}
sum *= float(cache_b_ds.x);
return sum;
}
#endif
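
Both removed dot products lean on dotPacked4x8EXT, which multiplies four packed 8-bit lanes at a time. A CPU model of that primitive, assuming signed lanes as the int32 operands here imply (illustrative only):

    #include <cstdint>

    // Model of dotPacked4x8EXT(a, b): treat each 32-bit word as four packed
    // signed 8-bit lanes and return the lane-wise dot product.
    int32_t dot_packed_4x8(int32_t a, int32_t b) {
        int32_t sum = 0;
        for (int i = 0; i < 4; ++i) {
            int8_t ai = (int8_t)((a >> (8 * i)) & 0xFF);
            int8_t bi = (int8_t)((b >> (8 * i)) & 0xFF);
            sum += (int32_t)ai * bi;
        }
        return sum;
    }

Because every grid nibble carries a +1 bias (see the iq1s_grid_gpu comment below), q_sum over-counts by exactly the sum of the activation lanes; that is why both functions fold a (delta - 1) factor into the correction term rather than delta itself.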

View File

@@ -7,10 +7,6 @@
#include "types.glsl"

-#define GATING_FUNC_SOFTMAX 0
-#define GATING_FUNC_SIGMOID 1
-#define GATING_FUNC_SOFTMAX_WEIGHT 2
-
layout (push_constant) uniform parameter
{
    uint n_rows;
@@ -18,18 +14,15 @@ layout (push_constant) uniform parameter
    uint n_expert_used;
    float clamp_min;
    float clamp_max;
-    uint gating_func;
-    uint has_bias;
-    uint with_norm;
-    float output_scale;
-    float output_bias;
};

layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;

layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts_spec = 512;
-layout(constant_id = 2) const bool nexperts_use_push = false;
+layout(constant_id = 2) const bool with_norm = true;
+layout(constant_id = 3) const bool late_softmax = false;
+layout(constant_id = 4) const bool nexperts_use_push = false;

uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
@@ -38,9 +31,8 @@ uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);

layout (binding = 0, std430) readonly buffer Logits {float logits[];};
-layout (binding = 1, std430) readonly buffer BiasProbs {float bias[];};
-layout (binding = 2, std430) writeonly buffer Weights {float weights[];};
-layout (binding = 3, std430) writeonly buffer Ids {uint ids[];};
+layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
+layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};

const float INFINITY = 1.0 / 0.0;
@@ -95,40 +87,20 @@ void main() {
    }

    const uint logits_offset = n_experts * row;
-    const uint bias_offset = 0; // 1D
    const uint weights_offset = n_expert_used * row;
    const uint ids_offset = n_experts * row;

    const uint lane = gl_SubgroupInvocationID;

-    float probs[experts_per_thread];
+    float wt[experts_per_thread];
    [[unroll]]
    for (uint i = 0; i < n_experts; i += WARP_SIZE) {
        const uint expert = i + lane;
-        probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
+        wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
    }

-    if (gating_func == GATING_FUNC_SOFTMAX) {
-        softmax_warp_inplace(probs, n_experts, lane, nexperts_use_push);
-    } else if (gating_func == GATING_FUNC_SIGMOID) {
-        [[unroll]]
-        for (int i = 0; i < experts_per_thread; i++) {
-            probs[i] = 1.f / (1.f + exp(-probs[i]));
-        }
-    }
-
-    float selection_probs[experts_per_thread];
-    if (has_bias != 0) {
-        [[unroll]]
-        for (uint i = 0; i < n_experts; i += WARP_SIZE) {
-            const uint expert = i + lane;
-            selection_probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? probs[i / WARP_SIZE] + bias[bias_offset + expert] : -INFINITY;
-        }
-    } else {
-        [[unroll]]
-        for (int i = 0; i < experts_per_thread; i++) {
-            selection_probs[i] = probs[i];
-        }
+    if (!late_softmax) {
+        softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
    }

    // at this point, each thread holds a portion of softmax,
@@ -145,16 +117,14 @@ void main() {
    }

    for (int k = 0; k < n_expert_used; k++) {
-        float max_val = probs[0];
-        float max_val_s = selection_probs[0];
+        float max_val = wt[0];
        uint max_expert = lane;

        [[unroll]]
        for (int i = 1; i < experts_per_thread; i++) {
            const uint expert = lane + i * WARP_SIZE;
-            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i] > max_val_s) {
-                max_val = probs[i];
-                max_val_s = selection_probs[i];
+            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
+                max_val = wt[i];
                max_expert = expert;
            }
        }
@@ -162,11 +132,9 @@ void main() {
        [[unroll]]
        for (uint mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
            const float val = subgroupShuffleXor(max_val, mask);
-            const float val_s = subgroupShuffleXor(max_val_s, mask);
            const uint expert = subgroupShuffleXor(max_expert, mask);
-            if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
+            if (val > max_val || (val == max_val && expert < max_expert)) {
                max_val = val;
-                max_val_s = val_s;
                max_expert = expert;
            }
        }
@@ -176,14 +144,16 @@ void main() {
        }

        if ((max_expert & (WARP_SIZE - 1)) == lane) {
-            selection_probs[max_expert / WARP_SIZE] = -INFINITY;
+            wt[max_expert / WARP_SIZE] = -INFINITY;
            ids[ids_offset + k] = max_expert;
-            wt_sum += max_val;
+            if (with_norm) {
+                wt_sum += max_val;
+            }
        }
    }

-    if (with_norm != 0) {
+    if (with_norm) {
        wt_sum = subgroupAdd(wt_sum);
        wt_sum = clamp(wt_sum, clamp_min, clamp_max);
        const float inv_sum = 1.0f / wt_sum;
@@ -194,7 +164,7 @@ void main() {
        }
    }

-    if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) {
+    if (late_softmax) {
        softmax_warp_inplace(output_weights, n_expert_used, lane, true);
    }

@@ -202,7 +172,7 @@ void main() {
    for (uint i = 0; i < experts_per_thread; ++i) {
        uint idx = i * WARP_SIZE + lane;
        if (idx < n_expert_used) {
-            weights[weights_offset + idx] = output_scale * output_weights[i] + output_bias;
+            weights[weights_offset + idx] = output_weights[i];
        }
    }
}
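
The removed bias path ranks experts by probs + bias but returns the unbiased probability as the weight, and it knocks out a chosen expert's selection score so it cannot win again. A scalar sketch of that selection rule (illustrative, not the shader code):

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Pick k experts by biased selection score; report their indices.
    std::vector<uint32_t> topk_biased(const std::vector<float> & probs,
                                      const std::vector<float> & bias, int k) {
        std::vector<float> sel(probs.size());
        for (size_t i = 0; i < probs.size(); ++i) {
            sel[i] = probs[i] + bias[i];      // rank by biased score
        }
        std::vector<uint32_t> ids;
        for (int j = 0; j < k; ++j) {
            size_t best = 0;
            for (size_t i = 1; i < sel.size(); ++i) {
                if (sel[i] > sel[best]) best = i;
            }
            ids.push_back((uint32_t)best);
            // knock the winner out, mirroring selection_probs[...] = -INFINITY
            sel[best] = -std::numeric_limits<float>::infinity();
        }
        return ids;                           // weights would come from probs[ids[j]]
    }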

View File

@@ -396,12 +396,6 @@ struct block_iq1_s {
    uint16_t qh[QUANT_K_IQ1_S/32];
};

-struct block_iq1_s_packed16 {
-    float16_t d;
-    uint16_t qs[QUANT_K_IQ1_S/8/2];
-    uint16_t qh[QUANT_K_IQ1_S/32];
-};
-
#define QUANT_K_IQ1_M 256
#define QUANT_R_IQ1_M 1
@@ -411,18 +405,6 @@ struct block_iq1_m {
    uint16_t scales[QUANT_K_IQ1_M/64];
};

-struct block_iq1_m_packed16 {
-    uint16_t qs[QUANT_K_IQ1_M/8/2];
-    uint16_t qh[QUANT_K_IQ1_M/16/2];
-    uint16_t scales[QUANT_K_IQ1_M/64];
-};
-
-struct block_iq1_m_packed32 {
-    uint32_t qs[QUANT_K_IQ1_M/8/4];
-    uint32_t qh[QUANT_K_IQ1_M/16/4];
-    uint32_t scales[QUANT_K_IQ1_M/64/2];
-};
-
struct block_iq1_m_packed64 {
    uint64_t qs[QUANT_K_IQ1_M/8/8];
    uint64_t qh[QUANT_K_IQ1_M/16/8];
@@ -433,15 +415,12 @@ struct block_iq1_m_packed64 {
#define QUANT_K QUANT_K_IQ1_S
#define QUANT_R QUANT_R_IQ1_S
#define A_TYPE block_iq1_s
-#define A_TYPE_PACKED16 block_iq1_s_packed16
#endif

#if defined(DATA_A_IQ1_M)
#define QUANT_K QUANT_K_IQ1_M
#define QUANT_R QUANT_R_IQ1_M
#define A_TYPE block_iq1_m
-#define A_TYPE_PACKED16 block_iq1_m_packed16
-#define A_TYPE_PACKED32 block_iq1_m_packed32
#endif

#if defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M)
@@ -580,270 +559,7 @@ const uint[1024] iq1s_grid_const = {
    0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557
};
// Same content as iq1s_grid_const except each 2-bit value is expanded to 4-bit
// and has 1 added to it (allows packed values to be extracted with & 0x0F0F0F0F
// and 0xF0F0F0F0).
const uint32_t[2048] iq1s_grid_gpu_const = {
0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
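
The expanded table exists so a consumer can split one 32-bit word into two dotPacked4x8EXT operands with a single mask each, exactly as repack8 does above. A small sketch of the extraction (illustrative only):

    #include <cstdint>

    // Split a grid word into its even and odd 4-bit lanes; each result packs
    // four byte lanes ready for a packed 4x8 dot product.
    void split_grid_word(int32_t grid, int32_t & lo, int32_t & hi) {
        lo = (grid >> 0) & 0x0F0F0F0F; // even nibbles
        hi = (grid >> 4) & 0x0F0F0F0F; // odd nibbles
    }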
shared uint16_t iq1s_grid[2048];
-shared uint32_t iq1s_grid_gpu[2048];

#define NEEDS_INIT_IQ_SHMEM
void init_iq_shmem(uvec3 wgsize)
@@ -857,12 +573,6 @@ void init_iq_shmem(uvec3 wgsize)
            iq1s_grid[2*idx+1] = g.y;
        }
    }
-    [[unroll]] for (uint i = 0; i < iq1s_grid_gpu_const.length(); i += wgsize.x) {
-        uint idx = i + gl_LocalInvocationIndex.x;
-        if (iq1s_grid_gpu_const.length() % wgsize.x == 0 || idx < iq1s_grid_gpu_const.length()) {
-            iq1s_grid_gpu[idx] = iq1s_grid_gpu_const[idx];
-        }
-    }
    barrier();
}
#endif

View File

@@ -685,7 +685,7 @@ void process_shaders() {
    // mul mat vec with integer dot product
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
-        if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname) || tname == "iq1_s" || tname == "iq1_m") {
+        if (is_legacy_quant(tname) || tname == "mxfp4" || is_k_quant(tname)) {
            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}}));
            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}));
            string_to_spv("mul_mat_vec_" + tname + "_q8_1_f32_subgroup_no_shmem", "mul_mat_vecq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}, {"ACC_TYPE", "float"}, {"USE_SUBGROUP_ADD_NO_SHMEM", "1"}}));
@@ -944,8 +944,6 @@ void process_shaders() {
    string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
    string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-    string_to_spv("cumsum_multipass2_f32", "cumsum_multipass2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

    string_to_spv("count_experts", "count_experts.comp", merge_maps(base_dict, {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}}));
@@ -1125,7 +1123,7 @@ void write_output_files() {
    for (const std::string& btype : btypes) {
        for (const auto& tname : type_names) {
-            if (btype == "q8_1" && !is_legacy_quant(tname) && tname != "mxfp4" && !is_k_quant(tname) && tname != "iq1_s" && tname != "iq1_m") {
+            if (btype == "q8_1" && !is_legacy_quant(tname) && tname != "mxfp4" && !is_k_quant(tname)) {
                continue;
            }
            hdr << "extern const void * arr_dmmv_" << tname << "_" << btype << "_f32_data[3];\n";


@@ -294,9 +294,7 @@ class Keys:
USE_GELU = "clip.use_gelu"
USE_SILU = "clip.use_silu"
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
- WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl
IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
- WINDOW_SIZE = "clip.vision.window_size"
class Attention:
HEAD_COUNT = "clip.vision.attention.head_count"
@@ -454,7 +452,6 @@ class MODEL_ARCH(IntEnum):
MISTRAL3 = auto()
MIMO2 = auto()
LLAMA_EMBED = auto()
- MAINCODER = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
@@ -853,7 +850,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MISTRAL3: "mistral3",
MODEL_ARCH.MIMO2: "mimo2",
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
- MODEL_ARCH.MAINCODER: "maincoder",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3261,22 +3257,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.MAINCODER: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO
}
@@ -3512,9 +3492,7 @@ class VisionProjectorType:
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
LFM2A = "lfm2a" # audio
- MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
- YOUTUVL = "youtuvl"
# Items here are (block size, type size)


@@ -1129,40 +1129,11 @@ class GGUFWriter:
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
def add_vision_n_wa_pattern(self, value: int) -> None:
- """Add window attention pattern interval for vision models.
- This defines the pattern interval for window attention vs full attention layers.
- For example, if n_wa_pattern=4, then layers 3, 7, 11, ... use full attention,
- while other layers use window attention.
- Used by models like Qwen2.5-VL where full attention layers follow a regular pattern.
- """
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
def add_vision_wa_layer_indexes(self, layers: Sequence[int]) -> None:
"""Add explicit layer indexes that use full attention in vision models.
This specifies the exact layer indices (0-based) that should use full attention
instead of window attention. All other layers will use window attention.
Args:
layers: List of layer indices that use full attention (e.g., [3, 7, 11, 15])
Used by models like YoutuVL where full attention layers are explicitly specified
rather than following a regular pattern.
Difference from add_vision_n_wa_pattern:
- n_wa_pattern: Defines a regular interval pattern (every Nth layer uses full attention)
- wa_layer_indexes: Explicitly lists which layers use full attention (irregular pattern)
"""
self.add_array(Keys.ClipVision.WA_LAYER_INDEXES, layers)
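The two docstrings above distinguish a regular interval pattern from an explicit layer list. A minimal C++ sketch of how a consumer of these GGUF keys could combine them; the helper name and plain-vector inputs are illustrative, not the actual clip.cpp code:

#include <algorithm>
#include <cstdint>
#include <vector>

// true if vision layer `il` (0-based) should use full attention;
// an explicit index list takes precedence over the interval pattern
static bool is_full_attn_layer(int32_t il, uint32_t n_wa_pattern,
                               const std::vector<int32_t> & wa_layer_indexes) {
    if (!wa_layer_indexes.empty()) {
        // irregular pattern, e.g. [3, 7, 11, 15] (youtuvl-style)
        return std::find(wa_layer_indexes.begin(), wa_layer_indexes.end(), il) != wa_layer_indexes.end();
    }
    if (n_wa_pattern > 0) {
        // regular pattern: layers 3, 7, 11, ... when n_wa_pattern == 4 (qwen2.5vl-style)
        return (il + 1) % n_wa_pattern == 0;
    }
    return true; // neither key set: full attention everywhere
}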
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)
- def add_vision_window_size(self, value: int) -> None:
- self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
# audio models
def add_audio_projection_dim(self, value: int) -> None:


@@ -1221,7 +1221,6 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
- "merger.mlp.{bid}",
),
MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1259,7 +1258,6 @@ class TensorNameMap:
"visual.patch_embed.proj", # qwen2vl
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
- "siglip2.vision_model.embeddings.patch_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1293,7 +1291,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
- "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1311,7 +1308,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
- "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1329,7 +1325,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
- "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1344,7 +1339,6 @@ class TensorNameMap:
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
- "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1360,7 +1354,6 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.proj", # qwen2vl
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
- "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1375,7 +1368,6 @@ class TensorNameMap:
"visual.blocks.{bid}.norm2", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
- "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1391,7 +1383,6 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
- "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1413,7 +1404,6 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
- "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1440,7 +1430,6 @@ class TensorNameMap:
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
- "siglip2.vision_model.post_layernorm",
),
MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1457,7 +1446,6 @@ class TensorNameMap:
"multi_modal_projector.pre_norm",
"pre_mm_projector_norm",
"model.vision.linear_proj.norm1", # cogvlm
- "merger.ln_q",
),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (


@@ -316,11 +316,6 @@ extern "C" {
bool no_alloc; // only load metadata and simulate memory allocations
};
- struct llama_sampler_seq_config {
- llama_seq_id seq_id;
- struct llama_sampler * sampler;
- };
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggml-org/llama.cpp/pull/7544
struct llama_context_params {
@@ -369,12 +364,6 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
- // [EXPERIMENTAL]
- // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
- // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
- struct llama_sampler_seq_config * samplers;
- size_t n_samplers;
};
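A minimal sketch of wiring a per-sequence sampler chain into the context parameters, assuming the chain helpers from the sampling API further below; the caller must keep the chain alive for the lifetime of the context:

llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_greedy());

struct llama_sampler_seq_config cfg = { /*.seq_id =*/ 0, /*.sampler =*/ chain };

struct llama_context_params cparams = llama_context_default_params();
cparams.samplers   = &cfg;
cparams.n_samplers = 1;
// create the context with cparams; free the chain only after the context is freed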
// model quantization parameters
@@ -1003,32 +992,6 @@ extern "C" {
// otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// backend sampling API [EXPERIMENTAL]
// note: use only if the llama_context was created with at least one llama_sampler_seq_config
//
// Get the backend sampled token for the ith token.
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled logits for the ith token
// Returns NULL if no logits were sampled.
LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled candidates (token ids) for the ith token
// These are needed to map probability/logit indices to vocab token ids.
// Returns NULL if no candidates were sampled.
LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
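Putting the getters together, a hedged sketch of reading back one output after llama_decode() on a context configured with backend samplers (the index 0 is illustrative):

const int32_t i = 0; // output index of this sequence's single output token

llama_token tok = llama_get_sampled_token_ith(ctx, i);
if (tok != LLAMA_TOKEN_NULL) {
    // the backend selected a concrete token; no CPU-side sampling is needed
} else if (float * probs = llama_get_sampled_probs_ith(ctx, i)) {
    const uint32_t n_probs = llama_get_sampled_probs_count_ith(ctx, i);
    llama_token *  cands   = llama_get_sampled_candidates_ith(ctx, i);
    // probs[k] is the probability of vocab token cands[k], for k in [0, n_probs)
}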
//
// Vocab
//
@@ -1200,16 +1163,11 @@ extern "C" {
//
// llama_sampler_free(smpl);
//
- // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
- //
typedef void * llama_sampler_context_t;
- struct llama_sampler_data {
- struct ggml_tensor * logits;
- struct ggml_tensor * probs;
- struct ggml_tensor * sampled;
- struct ggml_tensor * candidates;
- };
// user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1219,45 +1177,17 @@ extern "C" {
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
- // [EXPERIMENTAL]
- // backend sampling interface:
+ // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+ //void (*apply_ggml) (struct llama_sampler * smpl, ...);
// return true if the backend supports all ops needed by the sampler
// note: call once per sampler
bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
// call after .backend_apply()
void (*backend_accept)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_tensor * selected_token);
// call after .backend_init()
void (*backend_apply)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct llama_sampler_data * data);
// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);
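Read together, the four callbacks imply roughly this lifecycle: backend_init() once per sampler, then per graph build backend_apply() appends the sampling ops, backend_set_input() fills inputs right before execution, and backend_accept() records the selected token. A skeleton with empty bodies (the function names are my own, not a real implementation):

static bool my_backend_init(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft) {
    // allocate any input tensors on `buft`; return false if a required op is unsupported
    return true;
}

static void my_backend_apply(struct llama_sampler * smpl, struct ggml_context * ctx,
                             struct ggml_cgraph * gf, struct llama_sampler_data * data) {
    // append ops to `gf` that filter data->logits / data->probs,
    // or write the chosen token into data->sampled
}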
};
struct llama_sampler {
- struct llama_sampler_i * iface;
+ const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
- // [EXPERIMENTAL]
- // attach a sampler to the context
- // note: prefer initializing the context with llama_context_params.samplers when possible
- // note: changing the samplers of a context can cause graph reallocations and degraded performance
- LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i:
- LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+ LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1273,15 +1203,7 @@ extern "C" {
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
- // return NULL if:
- // - the sampler is NULL
- // - the sampler is not a llama_sampler_chain
- // - the index is out of bounds, unless i == -1
- // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
- LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
+ LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
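The i == -1 convention documented above doubles as a cheap type check:

// chain_get returns the chain itself for i == -1, so NULL means "not a chain"
const bool is_chain = llama_sampler_chain_get(smpl, -1) != NULL;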
// the total number of samplers in the chain
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed


@@ -1 +1 @@
- ebc3a0f4a56be1c9424a89fbec09962ac34fde85
+ 130bc125a88bb57664b88932c48c38a1cb316fac


@@ -87,7 +87,6 @@ add_library(llama
models/llada.cpp
models/llama-iswa.cpp
models/llama.cpp
- models/maincoder.cpp
models/mamba.cpp
models/mimo2-iswa.cpp
models/minicpm3.cpp


@@ -118,7 +118,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
- { LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -2235,23 +2234,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
return {
LLM_TENSOR_TOKEN_EMBD,
};
case LLM_ARCH_MAINCODER:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
default:
GGML_ABORT("unknown architecture for tensor mapping");
}


@@ -122,7 +122,6 @@ enum llm_arch {
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
- LLM_ARCH_MAINCODER,
LLM_ARCH_UNKNOWN,
};


@@ -74,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
- { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -217,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
- } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
- return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -848,14 +845,6 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "[unused9]助手:";
}
- } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
- for (auto message : chat) {
- std::string role(message->role);
- ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
- }
- if (add_ass) {
- ss << "<|begin|>assistant";
- }
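For reference, the removed branch renders each message as <|begin|>role<|content|>text<|end|>; with add_ass set, a two-message chat (contents illustrative) becomes:

<|begin|>system<|content|>You are a helpful assistant.<|end|><|begin|>user<|content|>Hi<|end|><|begin|>assistant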
} else {
// template not supported
return -1;


@@ -54,7 +54,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED,
- LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN,
};


@@ -60,25 +60,6 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
if (params.samplers != nullptr && params.n_samplers > 0) {
for (size_t i = 0; i < params.n_samplers; ++i) {
const auto & config = params.samplers[i];
if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
}
if (set_sampler(config.seq_id, config.sampler)) {
const int n_samplers = llama_sampler_chain_n(config.sampler);
LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
}
}
}
auto rope_scaling_type = params.rope_scaling_type;
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
rope_scaling_type = hparams.rope_scaling_type_train;
@@ -250,10 +231,7 @@
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
- // Create a dummy batch for initialization.
- llama_batch dummy_batch = {};
- dummy_batch.n_tokens = 0;
- if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
+ if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@@ -478,16 +456,6 @@
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
}
}
// Initialize the full vocabulary token ids for backend samplers.
{
const int n_vocab = model.vocab.n_tokens();
sampling.token_ids_full_vocab.resize(n_vocab);
for (int i = 0; i < n_vocab; ++i) {
sampling.token_ids_full_vocab[i] = i;
}
}
}
llama_context::~llama_context() {
@@ -648,35 +616,6 @@ float * llama_context::get_logits() {
return logits;
}
int64_t llama_context::output_resolve_row(int32_t i) const {
int64_t j = -1;
// support negative indices (last output row)
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
}
} else if ((size_t) i >= output_ids.size()) {
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
} else {
// use output_ids to translate the batch token index into a row number
// that holds this token's data.
j = output_ids[i];
}
if (j < 0) {
// the batch token was not configured to output anything
throw std::runtime_error(format("batch.logits[%d] != true", i));
}
if (j >= n_outputs) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
return j;
}
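In short: negative indices count back from the last output row, while non-negative indices are translated through output_ids. A worked example with illustrative values:

// n_outputs == 3, output_ids == { -1, 0, -1, 1, 2 }
// output_resolve_row(-1) -> 3 + (-1) == 2        (last output row)
// output_resolve_row(3)  -> output_ids[3] == 1   (row holding batch token 3)
// output_resolve_row(0)  -> output_ids[0] == -1  -> throws "batch.logits[0] != true"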
float * llama_context::get_logits_ith(int32_t i) {
int64_t j = -1;
@@ -687,7 +626,6 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error("no logits");
}
- // TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -724,10 +662,6 @@ float * llama_context::get_embeddings() {
return embd;
}
- llama_token * llama_context::get_sampled_tokens() const{
- return sampling.sampled;
- }
float * llama_context::get_embeddings_ith(int32_t i) {
int64_t j = -1;
@@ -738,7 +672,6 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error("no embeddings");
}
- // TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -778,136 +711,6 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
return it->second.data();
}
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
if (sampling.sampled == nullptr) {
return LLAMA_TOKEN_NULL;
}
try {
const int64_t row = output_resolve_row(idx);
GGML_ASSERT(row < (int64_t) sampling.sampled_size);
return sampling.sampled[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
return LLAMA_TOKEN_NULL;
}
}
float * llama_context::get_sampled_probs_ith(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
return nullptr;
}
return sampling.probs + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
float * llama_context::get_sampled_logits_ith(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
return nullptr;
}
return sampling.logits + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
output_reorder();
try {
const int64_t row = output_resolve_row(idx);
if (sampling.candidates != nullptr &&
(size_t) row < sampling.candidates_count.size() &&
sampling.candidates_count[row] > 0) {
return sampling.candidates + row*model.vocab.n_tokens();
}
} catch (const std::exception & err) {
// fallback to full vocab list
}
return sampling.token_ids_full_vocab.data();
}
size_t llama_context::get_sampled_candidates_count(int32_t idx) {
output_reorder();
if (sampling.candidates == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.candidates_count.size()) {
return 0;
}
return sampling.candidates_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_logits_count(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return model.vocab.n_tokens();
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size()) {
return 0;
}
return sampling.logits_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_probs_count(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size()) {
return 0;
}
return sampling.probs_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
void llama_context::attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch) {
@@ -964,42 +767,6 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
const bool can_offload =
sampler &&
sampler->iface->backend_init &&
sampler->iface->backend_apply &&
llama_sampler_chain_n(sampler) > 0;
if (sampler && can_offload) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
if (host_buft) {
buft = host_buft;
}
sampler->iface->backend_init(sampler, buft);
sampling.samplers[seq_id] = sampler;
return true;
}
if (sampler && !can_offload) {
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
sampling.samplers.erase(seq_id);
return false;
}
sampling.samplers.erase(seq_id);
return true;
}
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1140,7 +907,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_queued_tokens += n_tokens;
// reserve output buffer
- if (output_reserve(n_tokens, batch_inp) < n_tokens) {
+ if (output_reserve(n_tokens) < n_tokens) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
return -2;
};
@@ -1264,112 +1031,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
return 0;
}
static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
std::map<llama_seq_id, uint32_t> seq_to_row;
// how many output tokens we have seen so far for this ubatch.
uint32_t local = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
// skip tokens that are not output.
if (!ubatch.output[i]) {
continue;
}
const llama_seq_id seq_id = ubatch.seq_id[i][0];
// row_offset is the number of output tokens before this ubatch.
seq_to_row[seq_id] = row_offset + local;
++local;
}
return seq_to_row;
}
static void copy_tensor_async_ints(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * sampled,
size_t sampled_size,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (sampled == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < sampled_size);
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
}
}
static void copy_tensor_async_floats(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
float * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
float * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of logits/probabilities that were written for this row.
counts[row] = ggml_nelements(tensor);
}
}
static void copy_tensor_async_candidates(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
llama_token * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of candidates that were written.
counts[row] = ggml_nelements(tensor);
}
}
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
@@ -1390,36 +1051,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_embd = hparams.n_embd_inp();
// when computing embeddings, all tokens are output
const bool output_all = cparams.embeddings;
- const bool has_samplers = !sampling.samplers.empty();
- const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
+ if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
// TODO: avoid this workaround in the future
if (has_samplers && batch_inp.logits) {
std::vector<int32_t> seq_output_count(n_seq_max, 0);
for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
if (batch_inp.logits[i] == 0) {
continue;
}
const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
for (int32_t s = 0; s < ns; ++s) {
const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
seq_output_count[seq_id]++;
if (seq_output_count[seq_id] > 1) {
LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
__func__, seq_id, seq_output_count[seq_id]);
return -1;
}
}
}
}
- if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1; return -1;
} }
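The rejected case in the workaround above is a batch that flags two or more output tokens for the same sequence. A conforming batch can be built like this (a sketch; the sizes and index variables are illustrative):

llama_batch batch = llama_batch_init(/*n_tokens =*/ 512, /*embd =*/ 0, /*n_seq_max =*/ 2);
// ... append prompt tokens for sequences 0 and 1 with batch.logits[i] = false ...
// then request exactly one output per sequence, typically its last token:
batch.logits[i_last_seq0] = true;
batch.logits[i_last_seq1] = true;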
@@ -1500,7 +1134,7 @@
}
// reserve output buffer
- if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2;
};
@@ -1573,10 +1207,7 @@
}
// extract logits
- // For multi-sequence batches that mix backend samplers and CPU sampler
- // this is currently inefficient as we copy all logits even for the
- // backend sampled tokens.
- if (logits && t_logits && n_outputs > 0) {
+ if (t_logits && n_outputs > 0) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
@@ -1591,7 +1222,7 @@
}
// extract embeddings
- if (embd && t_embd && n_outputs > 0) {
+ if (t_embd && n_outputs > 0) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
@@ -1645,22 +1276,6 @@
}
}
- // This flag indicates whether a backend sampler has actually sampled a specific
- // token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
if (has_samplers && has_sampled) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
const auto stride = n_vocab;
// async copy the sampling data from the backend to the host
copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
}
n_outputs_prev += n_outputs;
} while (mctx->next());
@@ -1724,7 +1339,7 @@
// output
//
- uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
+ uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
@@ -1743,53 +1358,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
has_embd = true;
}
- // Check which sampling modes are needed for the current batch.
- // TODO: avoid this branching by working with the worst-case
+ logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+ embd_size = has_embd ? n_embd*n_outputs_max : 0;
bool has_sampling = false;
bool cpu_logits = false;
if (batch.logits) {
for (int32_t i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
llama_seq_id seq_id = batch.seq_id[i][j];
if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
has_sampling = true;
} else {
cpu_logits = true;
}
}
}
} else {
// When batch.logits is nullptr (when loading state with a dummy batch),
// allocate CPU logits.
cpu_logits = true;
}
size_t backend_float_count = 0;
size_t backend_token_count = 0;
// Allocate CPU logits buffer only if needed by sequences in this batch
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd*n_outputs_max : 0;
// TODO: avoid this branching by working with the worst-case
if (!has_sampling) {
sampling.logits_size = 0;
sampling.probs_size = 0;
sampling.sampled_size = 0;
sampling.candidates_size = 0;
} else {
sampling.logits_size = n_vocab*n_outputs_max;
sampling.probs_size = n_vocab*n_outputs_max;
sampling.sampled_size = n_outputs_max;
sampling.candidates_size = n_vocab*n_outputs_max;
backend_float_count = sampling.logits_size + sampling.probs_size;
backend_token_count = sampling.sampled_size + sampling.candidates_size;
}
if (output_ids.empty()) {
// init, never resized afterwards
@@ -1797,9 +1367,7 @@
}
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
- const size_t new_size =
- (logits_size + embd_size + backend_float_count) * sizeof(float) +
- ( backend_token_count) * sizeof(llama_token);
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
@@ -1807,11 +1375,9 @@
if (buf_output) {
#ifndef NDEBUG
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
- LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
synchronize();
- // TODO: not needed?
buf_output = nullptr;
logits = nullptr;
embd = nullptr;
@@ -1833,49 +1399,8 @@
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
- logits = nullptr;
- embd = nullptr;
+ logits = has_logits ? output_base : nullptr;
+ embd = has_embd ? output_base + logits_size : nullptr;
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
logits = (has_logits && cpu_logits) ? output_base : nullptr;
offset += logits_size * sizeof(float);
embd = has_embd ? (float *) (base + offset) : nullptr;
offset += embd_size * sizeof(float);
sampling.logits = nullptr;
sampling.probs = nullptr;
sampling.sampled = nullptr;
sampling.candidates = nullptr;
if (has_sampling) {
sampling.logits = (float *) (base + offset);
offset += sampling.logits_size * sizeof(float);
sampling.probs = (float *) (base + offset);
offset += sampling.probs_size * sizeof(float);
sampling.sampled = (llama_token *) (base + offset);
offset += sampling.sampled_size * sizeof(llama_token);
sampling.candidates = (llama_token *) (base + offset);
offset += sampling.candidates_size * sizeof(llama_token);
// The count vectors keep track of the actual number of logits/probs/candidates
// copied from the backend for each output row.
sampling.logits_count.resize(n_outputs_max);
sampling.probs_count.resize(n_outputs_max);
sampling.candidates_count.resize(n_outputs_max);
std::fill(sampling.logits_count.begin(), sampling.logits_count.end(), 0);
std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
}
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1904,40 +1429,6 @@ void llama_context::output_reorder() {
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
}
}
if (sampling.logits && sampling.logits_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
}
}
if (sampling.probs && sampling.probs_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
}
}
if (sampling.candidates && sampling.candidates_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
}
}
if (sampling.sampled && sampling.sampled_size > 0) {
std::swap(sampling.sampled[i0], sampling.sampled[i1]);
}
if (!sampling.logits_count.empty()) {
std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
}
if (!sampling.probs_count.empty()) {
std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
}
if (!sampling.candidates_count.empty()) {
std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
}
}
output_swaps.clear();
@@ -1967,7 +1458,7 @@ ggml_cgraph * llama_context::graph_reserve(
if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
- n_outputs = std::max(n_outputs, n_tokens);
+ n_outputs = std::min(n_outputs, n_tokens);
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}
@@ -1986,15 +1477,6 @@
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
// set one output token per sequence in order to activate all backend samplers
std::vector<llama_seq_id> seq_ids(n_seqs);
for (uint32_t i = 0; i < n_seqs; ++i) {
seq_ids[i] = i;
ubatch.n_seq_id[i] = 1;
ubatch.seq_id[i] = &seq_ids[i];
ubatch.output[i] = true;
}
auto * res = gf_res_reserve.get();
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -2025,7 +1507,7 @@ llm_graph_params llama_context::graph_params(
llm_graph_result * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
@@ -2038,7 +1520,6 @@ llm_graph_params llama_context::graph_params(
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
- /*.samplers =*/ sampling.samplers,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
@@ -2494,9 +1975,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
}
- // TODO: handle sampling buffers and samplers state ?
- // https://github.com/ggml-org/llama.cpp/pull/17004
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
@@ -2529,10 +2007,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
auto n_outputs = this->n_outputs;
io.read_to(&n_outputs, sizeof(n_outputs));
- // Create a dummy batch for state loading.
- llama_batch dummy_batch = {};
- dummy_batch.n_tokens = 0;
- if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
+ if (n_outputs > output_reserve(n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
@@ -2586,9 +2061,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
}
- // TODO: handle sampling buffers and samplers state ?
- // https://github.com/ggml-org/llama.cpp/pull/17004
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
@@ -2777,7 +2249,7 @@ void llama_context::opt_epoch_iter(
}
// reserve output buffer
- if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error");
};
@@ -2922,8 +2394,6 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
- /*.sampler =*/ nullptr,
- /*.n_sampler =*/ 0,
};
return result;
@@ -3083,15 +2553,7 @@ float * llama_get_logits(llama_context * ctx) {
float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
- float * res = nullptr;
- res = ctx->get_sampled_logits_ith(i);
- if (!res) {
- res = ctx->get_logits_ith(i);
- }
- return res;
+ return ctx->get_logits_ith(i);
}
float * llama_get_embeddings(llama_context * ctx) {
@@ -3112,52 +2574,6 @@
return ctx->get_embeddings_seq(seq_id);
}
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
return ctx->set_sampler(seq_id, smpl);
}
llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_token_ith(i);
}
float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_probs_ith(i);
}
float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_logits_ith(i);
}
llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
}
uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
}
uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
}
uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
// llama adapter API
int32_t llama_set_adapter_lora(


@@ -70,18 +70,6 @@ struct llama_context {
float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
float * get_sampled_logits_ith(int32_t idx);
size_t get_sampled_logits_count(int32_t idx);
float * get_sampled_probs_ith(int32_t idx);
size_t get_sampled_probs_count(int32_t idx);
const llama_token * get_sampled_candidates_ith(int32_t idx);
size_t get_sampled_candidates_count(int32_t idx);
void attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
@@ -204,13 +192,10 @@ private:
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
- uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
+ uint32_t output_reserve(int32_t n_outputs);
void output_reorder();
- // map the output row index `i` to batch index
- int64_t output_resolve_row(int32_t i) const;
//
// graph
//
@@ -228,8 +213,6 @@ public:
ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
- bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
private:
llm_graph_params graph_params(
llm_graph_result * res,
@@ -269,31 +252,6 @@ private:
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// TODO: simplify
struct sampling_info {
std::map<llama_seq_id, llama_sampler *> samplers;
float * logits = nullptr;
size_t logits_size = 0;
llama_token * sampled = nullptr;
size_t sampled_size = 0;
float * probs = nullptr;
size_t probs_size = 0;
llama_token * candidates = nullptr;
size_t candidates_size = 0;
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
std::vector<llama_token> token_ids_full_vocab;
};
sampling_info sampling;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;


@@ -369,44 +369,6 @@ static void print_rule(
fprintf(file, "\n");
}
//
// Regex utilities
//
size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
auto find_start_pos = [](const std::smatch & match) {
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
return start;
};
if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
// match against the entire input
std::smatch match;
if (std::regex_match(input, match, regex)) {
return find_start_pos(match);
}
}
// search anywhere
std::smatch match;
if (std::regex_search(input, match, regex)) {
return find_start_pos(match);
}
return std::string::npos;
}
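A usage sketch for the anchored-versus-unanchored behavior (the pattern string and input are illustrative):

llama_grammar_trigger_pattern p;
p.pattern = "\\s*(<tool_call>)"; // not wrapped in ^...$, so find() uses regex_search
p.regex   = std::regex(p.pattern);

// start == 2: the position of "<tool_call>", i.e. of the first capturing group
size_t start = p.find("  <tool_call>{\"name\": \"get_time\"}");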
//
// implementation
//
@@ -1350,10 +1312,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece;
+ std::smatch match;
for (const auto & trigger_pattern : grammar.trigger_patterns) {
- auto start = trigger_pattern.find(grammar.trigger_buffer);
- if (start != std::string::npos) {
+ if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
// replay tokens that overlap with [start, end) // replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) { for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
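The lookup above keys the replay window off the first non-empty capturing group. A minimal standalone sketch of that logic, assuming only <regex> and <string> (find_trigger_start is a hypothetical name; the removed find() additionally uses std::regex_match for patterns anchored as "^...$"):

    #include <regex>
    #include <string>

    // The trigger position is the start of the first capturing group that
    // actually participated in the match, falling back to the start of the
    // whole match when no group did.
    static size_t find_trigger_start(const std::regex & re, const std::string & input) {
        std::smatch match;
        if (!std::regex_search(input, match, re)) {
            return std::string::npos;
        }
        for (size_t i = 1; i < match.size(); i++) {
            if (match.length(i) > 0) {
                return match.position(i);
            }
        }
        return match.position(0); // no capturing group matched -> whole match
    }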
View File
@@ -119,8 +119,6 @@ struct llama_grammar_parser {
 struct llama_grammar_trigger_pattern {
     std::string pattern;
     std::regex  regex;
-
-    size_t find(const std::string & input) const;
 };
 struct llama_grammar {
View File
@@ -12,7 +12,6 @@
 #include <cassert>
 #include <cmath>
 #include <cstring>
-#include <unordered_set>
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
@@ -33,7 +32,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     bool res = true;
     res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!embd && !params.ubatch.embd)  || (embd && embd->ne[1] == params.ubatch.n_tokens);
+    res &= (!embd && !params.ubatch.embd)  || (embd && embd->ne[0] == params.ubatch.n_tokens);
     return res;
 }
@@ -63,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     bool res = true;
-    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
+    res &= pos->ne[0] == params.ubatch.n_tokens;
     return res;
 }
@@ -522,43 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
     return res;
 }
-
-void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
-    // set the inputs only for the active samplers in the current ubatch
-    std::unordered_set<llama_seq_id> active_samplers;
-    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
-        if (ubatch->output[i]) {
-            llama_seq_id seq_id = ubatch->seq_id[i][0];
-            active_samplers.insert(seq_id);
-        }
-    }
-
-    for (auto seq_id : active_samplers) {
-        if (samplers.find(seq_id) == samplers.end()) {
-            continue;
-        }
-        auto & sampler = samplers[seq_id];
-        if (sampler->iface->backend_set_input) {
-            sampler->iface->backend_set_input(sampler);
-        }
-    }
-}
-
-bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
-    if (samplers.size() != params.samplers.size()) {
-        return false;
-    }
-    for (const auto & [seq_id, sampler] : params.samplers) {
-        if (samplers[seq_id] != sampler) {
-            return false;
-        }
-    }
-    return true;
-}
 //
 // llm_graph_result
 //
@@ -579,10 +541,6 @@ void llm_graph_result::reset() {
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
-    t_sampled.clear();
-    t_sampled_probs.clear();
-    t_sampled_logits.clear();
-    t_candidates.clear();
     params = {};
@@ -607,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
-
-void llm_graph_result::set_outputs() {
-    if (t_logits != nullptr) {
-        ggml_set_output(t_logits);
-    }
-    if (t_embd != nullptr) {
-        ggml_set_output(t_embd);
-    }
-    if (t_embd_pooled != nullptr) {
-        ggml_set_output(t_embd_pooled);
-    }
-    for (auto & [seq_id, t] : t_sampled) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_probs) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_logits) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_candidates) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-}
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
     if (!this->params.allow_reuse(params)) {
         if (debug > 1) {
@@ -720,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     loras    (params.loras),
     mctx     (params.mctx),
     cross    (params.cross),
-    samplers (params.samplers),
     cb_func  (params.cb),
     res      (params.res),
     ctx0     (res->get_ctx()),
@@ -1909,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);
-    ggml_set_name(inp->self_kq_mask, "self_kq_mask");
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-    ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
 }
 {
@@ -1925,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask_swa);
-    ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
     inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
-    ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
 }
 return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2165,87 +2086,6 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
-
-void llm_graph_context::build_sampling() const {
-    if (samplers.empty() || !res->t_logits) {
-        return;
-    }
-
-    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
-    res->add_input(std::move(inp_sampling));
-
-    std::map<llama_seq_id, int32_t> seq_to_logit_row;
-    int32_t logit_row_idx = 0;
-    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
-        if (ubatch.output[i]) {
-            llama_seq_id seq_id = ubatch.seq_id[i][0];
-            seq_to_logit_row[seq_id] = logit_row_idx;
-            logit_row_idx++;
-        }
-    }
-
-    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
-    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
-
-    // add a dummy row of logits
-    // this trick makes the graph static, regardless of which samplers are activated
-    // this is important in order to minimize graph reallocations
-    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
-    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
-
-    for (const auto & [seq_id, sampler] : samplers) {
-        const auto it = seq_to_logit_row.find(seq_id);
-
-        // inactive samplers always work on the first row
-        const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
-
-        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
-        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
-
-        struct llama_sampler_data data = {
-            /*.logits     =*/ logits_seq,
-            /*.probs      =*/ nullptr,
-            /*.sampled    =*/ nullptr,
-            /*.candidates =*/ nullptr,
-        };
-
-        assert(sampler->iface->backend_apply);
-        sampler->iface->backend_apply(sampler, ctx0, gf, &data);
-
-        if (data.sampled != nullptr) {
-            res->t_sampled[seq_id] = data.sampled;
-            ggml_build_forward_expand(gf, data.sampled);
-        }
-        if (data.probs != nullptr) {
-            res->t_sampled_probs[seq_id] = data.probs;
-            ggml_build_forward_expand(gf, data.probs);
-        }
-        if (data.logits != nullptr) {
-            res->t_sampled_logits[seq_id] = data.logits;
-            ggml_build_forward_expand(gf, data.logits);
-        }
-        if (data.candidates != nullptr) {
-            res->t_candidates[seq_id] = data.candidates;
-            ggml_build_forward_expand(gf, data.candidates);
-        }
-    }
-
-    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
-    /*
-    for (const auto & [seq_id, sampler] : samplers) {
-        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
-            ggml_tensor * selected_token = it->second;
-            if (selected_token != nullptr) {
-                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
-            }
-        }
-    }
-    */
-}
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
     // TODO move to hparams if a T5 variant appears that uses a different value
     const int64_t max_distance = 128;
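The removed build_sampling() rests on a dummy-row trick: padding t_logits with one extra row keeps the graph topology identical whether or not a given sampler produced an output in the current ubatch, since inactive samplers can always be pointed at row 0. A minimal sketch of the idea, assuming a valid ggml_context and logits tensor (view_logits_row is a hypothetical helper, not part of the codebase):

    #include "ggml.h"

    // `row` is the logits row of an active sampler, or 0 for an inactive one.
    // Padding dim 1 by a single dummy row makes row 0 always safe to view, so
    // the compiled graph shape never depends on which samplers ran.
    static ggml_tensor * view_logits_row(ggml_context * ctx, ggml_tensor * t_logits, int64_t row) {
        ggml_tensor * padded = ggml_pad(ctx, t_logits, /*p0=*/0, /*p1=*/1, /*p2=*/0, /*p3=*/0);
        return ggml_view_1d(ctx, padded, padded->ne[0], row * padded->nb[1]);
    }

A shape-stable graph can be reused across ubatches, which is exactly what the "minimize graph reallocations" comment above is after.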
View File
@@ -10,7 +10,6 @@
 #include <memory>
 #include <set>
 #include <functional>
-#include <map>
 struct ggml_cgraph;
 struct ggml_context;
@@ -397,18 +396,6 @@ public:
     const llama_memory_hybrid_context * mctx;
 };
-
-class llm_graph_input_sampling : public llm_graph_input_i {
-public:
-    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
-        samplers(std::move(samplers)) { }
-    virtual ~llm_graph_input_sampling() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-    bool can_reuse(const llm_graph_params & params) override;
-
-    std::map<llama_seq_id, llama_sampler *> samplers;
-};
 //
 // llm_graph_result
 //
@@ -442,23 +429,6 @@ struct llm_graph_params {
     const llama_memory_context_i * mctx;
     const llama_cross * cross;
-
-    std::map<llama_seq_id, llama_sampler *> samplers;
-
-    static bool samplers_equal(
-            const std::map<llama_seq_id, llama_sampler *> & lhs,
-            const std::map<llama_seq_id, llama_sampler *> & rhs) {
-        if (lhs.size() != rhs.size()) {
-            return false;
-        }
-        for (const auto & [seq_id, sampler] : lhs) {
-            auto it = rhs.find(seq_id);
-            if (it == rhs.end() || it->second != sampler) {
-                return false;
-            }
-        }
-        return true;
-    }
     uint32_t n_outputs;
     llm_graph_cb cb;
@@ -498,36 +468,15 @@ struct llm_graph_params {
         return false;
     }
-    if (n_outputs != other.n_outputs) {
-        return false;
-    }
-
-    if (!samplers_equal(samplers, other.samplers)) {
-        return false;
-    }
-
-    if (samplers.size() > 0) {
-        if (!ubatch.data || !other.ubatch.data) {
-            return false;
-        }
-        // check that the outputs are the same for all samplers
-        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-            if (ubatch.output[i] != other.ubatch.output[i] ||
-                ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
-                return false;
-            }
-        }
-    }
     return
         cparams.embeddings  == other.cparams.embeddings  &&
         cparams.causal_attn == other.cparams.causal_attn &&
         arch  == other.arch  &&
         gtype == other.gtype &&
         cvec  == other.cvec  &&
         loras == other.loras &&
-        cross == other.cross;
+        cross == other.cross &&
+        n_outputs == other.n_outputs;
 }
 };
@@ -550,7 +499,6 @@ public:
 void reset();
 void set_inputs(const llama_ubatch * ubatch);
-void set_outputs();
 // try to update the existing graph result using the new graph parameters in order to reuse it
 // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -569,11 +517,6 @@ public:
 ggml_tensor * t_embd        = nullptr;
 ggml_tensor * t_embd_pooled = nullptr;
-
-std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-std::map<llama_seq_id, ggml_tensor*> t_candidates;
-std::map<llama_seq_id, ggml_tensor*> t_sampled;
-std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
 std::vector<llm_graph_input_ptr> inputs;
 ggml_context_ptr ctx_compute;
@@ -649,8 +592,6 @@ struct llm_graph_context {
 const llama_memory_context_i * mctx;
 const llama_cross * cross;
-
-std::map<llama_seq_id, llama_sampler *> samplers;
 const llm_graph_cb & cb_func;
 llm_graph_result * res;
@@ -891,12 +832,6 @@ struct llm_graph_context {
         ggml_tensor * cls_out,
         ggml_tensor * cls_out_b) const;
-
-//
-// sampling (backend sampling)
-//
-
-void build_sampling() const;
 //
 // dense (out)
 //
View File
@@ -240,10 +240,9 @@ struct llama_file::impl {
             throw std::runtime_error("unexpectedly reached end of file");
         }
     } else {
-        size_t bytes_read = 0;
-        while (bytes_read < len) {
-            const size_t to_read = len - bytes_read;
-            ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
+        bool successful = false;
+        while (!successful) {
+            off_t ret = read(fd, ptr, len);
             if (ret == -1) {
                 if (errno == EINTR) {
@@ -252,16 +251,10 @@ struct llama_file::impl {
                 throw std::runtime_error(format("read error: %s", strerror(errno)));
             }
             if (ret == 0) {
-                // EOF: allow if this read was only pulling alignment padding past file end
-                off_t pos = lseek(fd, 0, SEEK_CUR);
-                if (pos != -1 && (size_t) pos == size) {
-                    std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
-                    return;
-                }
                 throw std::runtime_error("unexpectedly reached end of file");
             }
-            bytes_read += (size_t) ret;
+            successful = true;
         }
     }
 }
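The left side of this hunk loops precisely because POSIX read() may return fewer bytes than requested or be interrupted by a signal; the single-shot variant on the right can silently truncate large reads. A minimal sketch of the full-length read, assuming POSIX <unistd.h> (read_exact is a hypothetical name; the zero-fill of alignment padding past end-of-file from the original is deliberately omitted here):

    #include <cerrno>
    #include <cstddef>
    #include <stdexcept>
    #include <unistd.h>

    // Read exactly `len` bytes: retry on EINTR and on short reads, and treat
    // a true EOF before `len` bytes as an error.
    static void read_exact(int fd, void * dst, size_t len) {
        size_t done = 0;
        while (done < len) {
            ssize_t ret = ::read(fd, static_cast<char *>(dst) + done, len - done);
            if (ret == -1) {
                if (errno == EINTR) {
                    continue; // interrupted by a signal -> retry
                }
                throw std::runtime_error("read error");
            }
            if (ret == 0) {
                throw std::runtime_error("unexpectedly reached end of file");
            }
            done += static_cast<size_t>(ret);
        }
    }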
View File
@@ -126,7 +126,6 @@ const char * llm_type_name(llm_type type) {
     case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
     case LLM_TYPE_80B_A3B:   return "80B.A3B";
     case LLM_TYPE_100B_A6B:  return "100B.A6B";
-    case LLM_TYPE_102B_A12B: return "102B.A12B";
     case LLM_TYPE_106B_A12B: return "106B.A12B";
     case LLM_TYPE_230B_A10B: return "230B.A10B";
     case LLM_TYPE_235B_A22B: return "235B.A22B";
@@ -1110,14 +1109,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
-    case LLM_ARCH_MAINCODER:
-        {
-            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-            switch (hparams.n_layer) {
-                case 32: type = LLM_TYPE_1B; break;
-                default: type = LLM_TYPE_UNKNOWN;
-            }
-        } break;
     case LLM_ARCH_QWEN3VL:
         {
             ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1691,7 +1682,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
     ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
     ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
     ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
     if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1787,7 +1778,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     switch (hparams.n_layer) {
         case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
         case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -3330,14 +3320,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
-    const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
-    ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
-    const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
-    GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
-
-    layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
-    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
+    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
     layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4793,11 +4776,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     // output
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    // try to load output.weight, if not found, use token_embd (tied embeddings)
-    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-    if (!output) {
-        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-    }
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];
@@ -4860,11 +4839,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     // output
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    // try to load output.weight, if not found, use token_embd (tied embeddings)
-    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-    if (!output) {
-        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-    }
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];
@@ -5231,9 +5206,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
     layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
     layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
-    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
-    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
+    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
@@ -6786,37 +6761,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
     }
 } break;
-case LLM_ARCH_MAINCODER:
-    {
-        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-        // if output is NULL, init from the input tok embed
-        if (output == NULL) {
-            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
 default:
     throw std::runtime_error("unknown architecture");
 }
@@ -7462,10 +7406,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     {
         llm = std::make_unique<llm_build_llama<true>>(*this, params);
     } break;
-case LLM_ARCH_MAINCODER:
-    {
-        llm = std::make_unique<llm_build_maincoder>(*this, params);
-    } break;
 case LLM_ARCH_DECI:
     {
         llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7500,7 +7440,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     } break;
 case LLM_ARCH_MODERN_BERT:
     {
-        llm = std::make_unique<llm_build_modern_bert>(*this, params);
+        llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
     } break;
 case LLM_ARCH_NEO_BERT:
     {
@@ -7910,17 +7850,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 // add on pooling layer
 llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
-
-// add backend sampling layers (if any)
-llm->build_sampling();
 // if the gguf model was converted with --sentence-transformers-dense-modules
 // there will be two additional dense projection layers
 // dense linear projections are applied after pooling
 // TODO: move reranking logic here and generalize
 llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
-
-llm->res->set_outputs();
 return llm->res->get_gf();
 }
@@ -8079,7 +8014,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
     case LLM_ARCH_ERNIE4_5_MOE:
     case LLM_ARCH_MISTRAL3:
     case LLM_ARCH_LLAMA_EMBED:
-    case LLM_ARCH_MAINCODER:
         return LLAMA_ROPE_TYPE_NORM;
 // the pairs of head values are offset by n_rot/2
View File
@@ -119,7 +119,6 @@ enum llm_type {
     LLM_TYPE_31B_A3_5B,
     LLM_TYPE_80B_A3B,   // Qwen3 Next
     LLM_TYPE_100B_A6B,
-    LLM_TYPE_102B_A12B, // Solar-Open
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
File diff suppressed because it is too large

View File
@@ -14,16 +14,7 @@ struct llama_grammar;
 struct llama_sampler_chain {
     llama_sampler_chain_params params;
-
-    // has .backend_init() been called?
-    bool is_init = false;
-
-    struct info {
-        bool is_backend;
-
-        llama_sampler * ptr;
-    };
-
-    std::vector<info> samplers;
+    std::vector<struct llama_sampler *> samplers;
     // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
     std::vector<llama_token_data> cur;
@@ -36,9 +27,9 @@ struct llama_sampler_chain {
 };
 struct llama_sampler * llama_sampler_init_dry_testing(
     int32_t context_size,
     float dry_multiplier,
     float dry_base,
     int32_t dry_allowed_length,
     int32_t dry_penalty_last_n,
-    const std::vector<std::vector<llama_token>> & seq_breakers);
+    const std::vector<std::vector<llama_token>>& seq_breakers);
View File
@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
         "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
     };
     break;
-case LLAMA_VOCAB_PRE_TYPE_YOUTU:
-    regex_exprs = {
-        "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
-        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-    };
-    break;
 case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
     regex_exprs = {
         "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
 case LLAMA_VOCAB_PRE_TYPE_QWEN2:
 case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
-case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
     regex_exprs = {
         // original regex from tokenizer.json
         // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1867,11 +1860,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     tokenizer_pre == "deepseek-v3") {
     pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
     clean_spaces = false;
-} else if (
-    tokenizer_pre == "youtu") {
-    pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
-    clean_spaces = false;
-    ignore_merges = true;
 } else if (
     tokenizer_pre == "falcon") {
     pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -2027,10 +2015,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     tokenizer_pre == "minimax-m2") {
     pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
     clean_spaces = false;
-} else if (
-    tokenizer_pre == "solar-open") {
-    pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
-    clean_spaces = false;
 } else {
     throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -2203,8 +2187,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 // for now, we apply this workaround to find the tokens based on their text
 for (const auto & t : token_to_id) {
-    auto & attr = id_to_token[t.second].attr;
-
     // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
     if (special_eot_id == LLAMA_TOKEN_NULL) {
         if (false
@@ -2220,10 +2202,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<end_of_utterance>" // smoldocling
            ) {
             special_eot_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
             }
         }
     }
@@ -2234,10 +2216,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|eom_id|>"
            ) {
             special_eom_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
             }
         }
     }
@@ -2254,10 +2236,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|code_prefix|>" // GLM-4.5
            ) {
             special_fim_pre_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
             }
         }
     }
@@ -2274,10 +2256,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|code_suffix|>" // GLM-4.5
            ) {
             special_fim_suf_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
             }
         }
     }
@@ -2294,10 +2276,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|code_middle|>" // GLM-4.5
            ) {
             special_fim_mid_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
            }
        }
    }
@@ -2311,10 +2293,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<PAD>"
            ) {
             special_fim_pad_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
            }
        }
    }
@@ -2329,10 +2311,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<reponame>" // Granite
            ) {
             special_fim_rep_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
            }
        }
    }
@@ -2343,41 +2325,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             || t.first == "<|file_sep|>" // Qwen
            ) {
             special_fim_sep_id = t.second;
-            if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+            if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                     __func__, t.second, t.first.c_str());
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
            }
        }
    }
 }
-
-// auto-detect unused tokens: e.g. control tokens with the word "unused"
-// ideally, these tokens should be marked as unused during conversion
-{
-    uint32_t n_unused = 0;
-
-    for (const auto & t : token_to_id) {
-        auto & attr = id_to_token[t.second].attr;
-
-        if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-            continue;
-        }
-
-        if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
-            if (strstr(t.first.c_str(), "unused") != NULL) {
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
-            }
-        }
-
-        if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
-            n_unused++;
-        }
-    }
-
-    LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
-}
 // maintain a list of tokens that cause end-of-generation
 // this is currently determined based on the token text, which is obviously not ideal
 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2396,16 +2352,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 }
 for (const auto & t : token_to_id) {
-    auto & attr = id_to_token[t.second].attr;
-
     if (false
         || t.first == "<|eot_id|>"
         || t.first == "<|im_end|>"
         || t.first == "<|end|>"
         || t.first == "<|return|>" // o200k_harmony
         || t.first == "<|call|>"   // o200k_harmony
-        || t.first == "<|flush|>"  // solar-open
-        || t.first == "<|calls|>"  // solar-open
        || t.first == "<end_of_turn>"
        || t.first == "<|endoftext|>"
        || t.first == "<|eom_id|>"
@@ -2415,28 +2367,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        || t.first == "<end_of_utterance>" // smoldocling
       ) {
        special_eog_ids.insert(t.second);
-        if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+        if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
            LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                __func__, t.second, t.first.c_str());
-            attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+            id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
        }
    } else {
-        if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
-            // token is control, but not marked as EOG -> print a debug log
-            if (special_eog_ids.count(t.second) == 0) {
-                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                    __func__, t.second, t.first.c_str());
-            }
-        }
+        // token is control, but not marked as EOG -> print a debug log
+        if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+            LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                __func__, t.second, t.first.c_str());
+        }
    }
 }
 // @ngxson : quick hack for gpt-oss, always render these tokens
 for (const auto & t : token_to_id) {
-    auto & attr = id_to_token[t.second].attr;
     if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
    }
 }
@@ -2456,42 +2404,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
 }
-// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
-// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
+// TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
 // we remove the "<|end|>" token from the EOG list
 {
     bool has_return = false;
     bool has_call   = false;
     bool has_end    = false;
-    bool has_flush  = false;
     llama_token end_id = LLAMA_TOKEN_NULL;
     LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
     for (auto tid : special_eog_ids) {
-        auto & text = id_to_token[tid].text;
-        LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
-        if (text == "<|return|>") {
+        LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+        if (id_to_token[tid].text == "<|return|>") {
             has_return = true;
-        } else if (text == "<|call|>" || text == "<|calls|>") {
+        } else if (id_to_token[tid].text == "<|call|>") {
             has_call = true;
-        } else if (text == "<|flush|>") {
-            has_flush = true;
-        } else if (text == "<|end|>") {
+        } else if (id_to_token[tid].text == "<|end|>") {
             has_end = true;
             end_id = tid;
        }
    }
-    if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
+    if (has_return && has_call && has_end) {
        special_eog_ids.erase(end_id);
-
-        auto & attr = id_to_token[end_id].attr;
-        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
-
-        LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+        id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+        LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
    }
 }
 }
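The recurring change in this file swaps flag accumulation for plain assignment: the left side ORs attribute bits in so previously set bits survive, while the right side overwrites the whole attribute word. A small illustration using the public llama_token_attr flags (token_data here is a hypothetical stand-in for the vocab's per-token entry):

    #include "llama.h"

    struct token_data { llama_token_attr attr; }; // hypothetical stand-in

    static void mark_control(token_data & token) {
        // OR the bit in, preserving e.g. an already-set LLAMA_TOKEN_ATTR_UNUSED:
        token.attr = (llama_token_attr) (token.attr | LLAMA_TOKEN_ATTR_CONTROL);
        // The assignment form would instead discard every other bit:
        // token.attr = LLAMA_TOKEN_ATTR_CONTROL;
    }

This is why the OR-ing side can also consult LLAMA_TOKEN_ATTR_UNUSED in the EOG debug-log branch above: the two flags can coexist on one token.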
View File
@@ -51,8 +51,6 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
     LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
     LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
-    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN      = 43,
-    LLAMA_VOCAB_PRE_TYPE_YOUTU           = 44,
 };
 struct LLM_KV;
View File
@@ -713,7 +713,7 @@ enum llama_params_fit_status llama_params_fit(
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
     };
     return result;
View File
@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
             LLM_FFN_GELU, LLM_FFN_SEQ, il);
     cb(cur, "ffn_out", il);
 } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-    const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
-    auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
     cur = build_ffn(cur,
-            model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+            model.layers[il].ffn_up,   NULL, NULL,
             model.layers[il].ffn_gate, NULL, NULL,
             model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-            type_op, LLM_FFN_PAR, il);
+            model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
     cb(cur, "ffn_out", il);
 } else {
     cur = build_ffn(cur,
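The removed lines infer whether the gate projection was exported fused into ffn_up by checking that tensor's second dimension against n_ff. A hedged sketch of that check (layer and n_ff stand in for the surrounding locals):

    // When up and gate are exported fused, ffn_up is twice as wide as n_ff and
    // the GEGLU op splits it internally; otherwise the plain GELU path is used.
    const bool up_contains_gate = layer.ffn_gate == nullptr && layer.ffn_up->ne[1] != n_ff;
    const llm_ffn_op_type type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;

The extra ne[1] comparison matters because some JINA_BERT_V2 conversions ship a separate gate tensor and others ship the fused layout, so the absence of ffn_gate alone is not enough to pick the op.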
View File
@@ -3,14 +3,12 @@
 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
-    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+    float kq_scale = 1.0f / sqrtf(float(n_embd_head));
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);
-    ggml_tensor * inpL;
-    ggml_tensor * cur;
+    ggml_tensor *inpL, *cur;
     inpL = build_inp_embd(model.tok_embd);
     ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
     }
     ggml_tensor * inpSA = inpL;
     cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
     // build self attention
     {
View File
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             model.layers[il].ffn_exp_probs_b,
             n_expert, n_expert_used,
             LLM_FFN_SILU, hparams.expert_weights_norm,
-            hparams.expert_weights_scale, hparams.expert_weights_scale,
+            true, hparams.expert_weights_scale,
             (llama_expert_gating_func_type) hparams.expert_gating_func,
             il);
 cb(moe_out, "ffn_moe_out", il);
View File
@@ -1,5 +1,7 @@
 #include "models.h"
 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
     inpL = build_inp_embd(model.tok_embd);
     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
View File
@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
     inpL = build_inp_embd(model.tok_embd);
     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
View File
@@ -1,5 +1,7 @@
 #include "models.h"
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params),
     model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
     inpL = build_inp_embd(model.tok_embd);
     // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
@@ -245,7 +248,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
     auto inp = std::make_unique<llm_graph_input_embd>();
     ggml_tensor * inp_per_layer;
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
View File
@@ -1,117 +0,0 @@
-#include "models.h"
-
-llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn = build_attn_inp_kv();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // norm
-        cur = build_norm(inpL,
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        }
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "ffn_out", il);
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-    cb(cur, "result_norm", -1);
-
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-    cb(cur, "result_output", -1);
-
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
View File
@@ -312,10 +312,6 @@ struct llm_build_llama_iswa : public llm_graph_context {
     llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
 };
-
-struct llm_build_maincoder : public llm_graph_context {
-    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
-};
 struct llm_build_mamba : public llm_graph_context_mamba {
     llm_build_mamba(const llama_model & model, const llm_graph_params & params);
 };
@@ -336,6 +332,7 @@ struct llm_build_mistral3 : public llm_graph_context {
     llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
 };
+template <bool iswa>
 struct llm_build_modern_bert : public llm_graph_context {
     llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
 };
View File
@ -1,6 +1,7 @@
#include "models.h"

-llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@ -23,7 +24,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
    auto * inp_attn = build_attn_inp_no_cache();

    for (int il = 0; il < n_layer; ++il) {
-       float freq_base_l = model.get_rope_freq_base(cparams, il);
+       float freq_base_l = 0.0f;
+       if constexpr (iswa) {
+           freq_base_l = model.get_rope_freq_base(cparams, il);
+       } else {
+           freq_base_l = freq_base;
+       }

        cur = inpL;
@ -113,3 +120,7 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
    res->t_embd = cur;
    ggml_build_forward_expand(gf, cur);
}
+
+// Explicit template instantiations
+template struct llm_build_modern_bert<false>;
+template struct llm_build_modern_bert<true>;
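Note on the hunk above: the constructor gains a `bool iswa` template parameter so the rope frequency base is selected at compile time, and both variants are instantiated explicitly so the definition can stay in the .cpp file. A minimal standalone sketch of that pattern (illustrative names, not the llama.cpp types):

```cpp
#include <cstdio>

// Sketch of the <bool iswa> pattern: `if constexpr` picks the per-layer vs.
// fixed frequency base at compile time; explicit instantiations at the end
// let the template's definition live in a .cpp file.
template <bool iswa>
struct bert_builder {
    float freq_base_for_layer(int il) const {
        if constexpr (iswa) {
            return 10000.0f + 100.0f * il; // stand-in for a per-layer lookup
        } else {
            return 10000.0f;               // one base for every layer
        }
    }
};

// Explicit template instantiations, mirroring the diff.
template struct bert_builder<false>;
template struct bert_builder<true>;

int main() {
    std::printf("%.1f %.1f\n",
                bert_builder<true>{}.freq_base_for_layer(3),
                bert_builder<false>{}.freq_base_for_layer(3));
    return 0;
}
```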

View File

@ -964,11 +964,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
    { "\\p{P}",  unicode_cpt_flags::PUNCTUATION },
    { "\\p{M}",  unicode_cpt_flags::ACCENT_MARK },
    { "\\p{S}",  unicode_cpt_flags::SYMBOL },
-   { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
-   { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
-   { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
-   { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
-   { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
};

static const std::map<int, int> k_ucat_cpt = {
@ -1079,26 +1074,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                continue;
            }

-           // Match \p{...} Unicode properties of varying lengths
-           if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
+           if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
                regex_expr[i + 1] == 'p' &&
-               regex_expr[i + 2] == '{') {
-               // Find the closing brace
-               size_t closing_brace = regex_expr.find('}', i + 3);
-               if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
-                   const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
-                   if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
-                       if (!inside) {
-                           regex_expr_collapsed += '[';
-                       }
-                       regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
-                       regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
-                       if (!inside) {
-                           regex_expr_collapsed += ']';
-                       }
-                       i = closing_brace;
-                       continue;
-                   }
+               regex_expr[i + 2] == '{' &&
+               regex_expr[i + 4] == '}') {
+               const std::string pat = regex_expr.substr(i, 5);
+               if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+                   if (!inside) {
+                       regex_expr_collapsed += '[';
+                   }
+                   regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+                   regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+                   if (!inside) {
+                       regex_expr_collapsed += ']';
+                   }
+                   i += 4;
+                   continue;
                }
            }
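The two sides of this hunk differ in how `\p{...}` escapes are scanned: the `+` side accepts only the fixed five-character form `\p{X}`, while the `-` side searches for the closing brace so multi-letter categories such as `\p{Lu}` also collapse. A self-contained sketch of the two strategies (simplified; the real code additionally maps the pattern through `k_ucat_enum`, and the caller guarantees `i < re.size()`):

```cpp
#include <cstddef>
#include <string>

// Fixed form: accepts exactly one character between the braces, i.e. \p{X}.
static bool match_fixed(const std::string & re, size_t i, std::string & pat) {
    if (re[i] == '\\' && i + 4 < re.size() &&
        re[i + 1] == 'p' && re[i + 2] == '{' && re[i + 4] == '}') {
        pat = re.substr(i, 5);
        return true;
    }
    return false;
}

// Variable form: scans to the closing brace (with a sanity limit), so
// two-letter categories like \p{Lu} or \p{Lo} are recognized as well.
static bool match_variable(const std::string & re, size_t i, std::string & pat) {
    if (re[i] == '\\' && i + 3 < re.size() &&
        re[i + 1] == 'p' && re[i + 2] == '{') {
        const size_t close = re.find('}', i + 3);
        if (close != std::string::npos && close <= i + 10) { // reasonable limit
            pat = re.substr(i, close - i + 1);
            return true;
        }
    }
    return false;
}
```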

View File

@ -219,18 +219,8 @@ endif()
llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)

llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")

-llama_build_and_test(test-backend-sampler.cpp LABEL "model")
-llama_test(test-backend-sampler NAME test-backend-sampler-greedy ARGS --test greedy)
-llama_test(test-backend-sampler NAME test-backend-sampler-temp ARGS --test temp)
-llama_test(test-backend-sampler NAME test-backend-sampler-top_k ARGS --test top_k)
-llama_test(test-backend-sampler NAME test-backend-sampler-dist ARGS --test dist)
-llama_test(test-backend-sampler NAME test-backend-sampler-dist-and-cpu ARGS --test dist_and_cpu)
-llama_test(test-backend-sampler NAME test-backend-sampler-logit-bias ARGS --test logit_bias)
-llama_test(test-backend-sampler NAME test-backend-sampler-mul_seq ARGS --test multi_sequence)
-llama_test(test-backend-sampler NAME test-backend-sampler-set-sampler ARGS --test set_sampler)

# Test for state restore with fragmented KV cache
# Requires a model, uses same args pattern as test-thread-safety

View File

@ -1158,7 +1158,6 @@ struct test_case {
    }

    virtual bool run_whole_graph() { return false; }

-   virtual std::vector<ggml_tensor *> fusion_test_nodes() { return {}; }

    ggml_cgraph * gf = nullptr;
    ggml_cgraph * gb = nullptr;
@ -1392,13 +1391,7 @@ struct test_case {
            GGML_UNUSED(index);
        };

-       std::vector<ggml_tensor *> fused_nodes_to_verify = fusion_test_nodes();
-       if (fused_nodes_to_verify.size() == 0 && run_whole_graph()) {
-           fused_nodes_to_verify.push_back(out);
-       }
-       const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud,
-           run_whole_graph() ? fused_nodes_to_verify.data() : nullptr,
-           fused_nodes_to_verify.size());
+       const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);

        ggml_backend_buffer_free(buf);
@ -5187,8 +5180,6 @@ struct test_topk_moe : public test_case {
    const bool bias_probs;
    const MoeGatingFunc gating_func;
    const float scale_w;

-   ggml_tensor * weights {};
-   ggml_tensor * selected_experts {};

    test_topk_moe(std::array<int64_t, 4> ne = { 10, 5, 1, 1 },
                  int n_expert_used = 1,
@ -5226,16 +5217,16 @@ struct test_topk_moe : public test_case {
        ggml_tensor * selection_probs = probs;
        if (bias_probs) {
-           ggml_tensor * exp_probs_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
+           ggml_tensor * exp_probs_b = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
            ggml_set_name(exp_probs_b, "exp_probs_b");

            selection_probs = ggml_add(ctx, probs, exp_probs_b);
            ggml_set_name(selection_probs, "selection_probs");
        }

-       selected_experts = ggml_argsort_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+       ggml_tensor * selected_experts = ggml_argsort_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
        ggml_set_name(selected_experts, "selected_experts");

-       weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+       ggml_tensor * weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
        ggml_set_name(weights, "weights");

        if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) {
@ -5261,21 +5252,6 @@ struct test_topk_moe : public test_case {
        ggml_set_name(weights, "weights");

        return weights;
    }

-   // Verify two outputs
-   std::vector<ggml_tensor *> fusion_test_nodes() override { return { selected_experts, weights }; }

-   // allow output in arbitrary order
-   double err(const float * a, const float * b, size_t n) override {
-       std::vector<float> a2(n);
-       std::vector<float> b2(n);
-       for (size_t i = 0; i < n; ++i) {
-           a2[i] = a[i];
-           b2[i] = b[i];
-       }
-       std::sort(a2.begin(), a2.end());
-       std::sort(b2.begin(), b2.end());
-       return nmse(a2.data(), b2.data(), n);
-   }
};

struct test_mul_mat_vec_fusion : public test_case {
@ -7775,11 +7751,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));
-   test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200000, 1, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
-   test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200000, 4, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
-   test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {643251, 3, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));

    for (float max_bias : {0.0f, 8.0f}) {
        for (float scale : {1.0f, 0.1f}) {
@ -7883,11 +7856,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
    }

-   for (int n = 1; n < 5; ++n) {
-       for (int k = 1; k <= n; ++k) {
-           test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {n, 2, 1, 3}, k, true));
-       }
-   }

    for (int i = 0; i < 20; ++i) {
        for (int k : {1, 2, 3, 7, 15, 100, 500, 1023, 9999}) {
            if (k <= 1<<i) {
@ -7975,7 +7943,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 2048, 5, 4, 3 }));
    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 201*1204, 1, 1, 1 }));
    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 312*1205, 1, 1, 1 }));
-   test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 20481, 4, 1, 1 }));

    test_cases.emplace_back(new test_xielu());
@ -8303,12 +8270,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        }
    }

-   for (int col : {8192, 16384, 32768, 65536, 131072, 262144, 524288}) {
-       for (int rows : {1, 4, 16}){
-           test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {col, rows, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
-       }
-   }

    test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
    test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
@ -8352,9 +8313,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
    }

    test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {65000, 16, 1, 1}));
-   test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {200000, 1, 1, 1}));
-   test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {200000, 16, 1, 1}));

    test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {2, 1, 1, 1}, 1));
    for (auto k : {1, 10, 40, 400}) {
@ -8365,18 +8324,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        }
    }

-   for (auto nrows : {1, 4, 8, 16}) {
-       for (auto cols : {128, 1024, 4096, 8192, 16384, 32768, 65536, 131072, 200000, 2000000}) {
-           test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, {cols, nrows, 1, 1}));
-       }
-   }

    // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf
    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill
    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 3328, 1, 1}, {4, 3328, 1, 1})); // generate
    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 512, 1)); // prefill
    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 1, 1)); // generate

    return test_cases;
}
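The removed `err` override above compares top-k MoE outputs order-insensitively: both buffers are sorted before computing the normalized mean squared error, since fused and unfused kernels may emit the selected experts in different orders. A sketch of that metric (the `nmse` definition below is an assumption; test-backend-ops has its own helper):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Order-insensitive error: sort both outputs, then compare element-wise.
// The error is normalized by the reference signal's energy (assumed nmse).
static double nmse_sorted(const float * a, const float * b, size_t n) {
    std::vector<float> a2(a, a + n);
    std::vector<float> b2(b, b + n);
    std::sort(a2.begin(), a2.end());
    std::sort(b2.begin(), b2.end());

    double num = 0.0, den = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const double d = double(a2[i]) - double(b2[i]);
        num += d * d;
        den += double(a2[i]) * double(a2[i]);
    }
    return den > 0.0 ? num / den : num;
}
```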

File diff suppressed because it is too large

View File

@ -724,30 +724,6 @@ static void test_tools_oaicompat_json_conversion() {
            "]"
        ),
        common_chat_tools_to_json_oaicompat<json>({special_function_tool}).dump(2));

-   {
-       auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
-           R"([{"type": "function", "function": {"name": "test_func", "description": "A test"}}])"));
-       assert_equals((size_t) 1, tools_no_params.size());
-       assert_equals(std::string("test_func"), tools_no_params[0].name);
-       assert_equals(std::string("A test"), tools_no_params[0].description);
-       assert_equals(std::string("{}"), tools_no_params[0].parameters);
-   }

-   {
-       auto tools_no_desc = common_chat_tools_parse_oaicompat(json::parse(
-           R"([{"type": "function", "function": {"name": "test_func", "parameters": {"type": "object"}}}])"));
-       assert_equals((size_t) 1, tools_no_desc.size());
-       assert_equals(std::string("test_func"), tools_no_desc[0].name);
-       assert_equals(std::string(""), tools_no_desc[0].description);
-   }

-   {
-       auto tools_minimal = common_chat_tools_parse_oaicompat(json::parse(
-           R"([{"type": "function", "function": {"name": "test_func"}}])"));
-       assert_equals((size_t) 1, tools_minimal.size());
-       assert_equals(std::string("test_func"), tools_minimal[0].name);
-       assert_equals(std::string(""), tools_minimal[0].description);
-       assert_equals(std::string("{}"), tools_minimal[0].parameters);
-   }
}

static void test_template_output_parsers() {

View File

@ -232,52 +232,52 @@ static void test_regex_to_reversed_partial_regex() {
    printf("[%s]\n", __func__);

    assert_equals<std::string>(
-       "^((?:(?:c)?b)?a)",
+       "((?:(?:c)?b)?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("abc"));

    assert_equals<std::string>(
-       "^(a+)",
+       "(a+)[\\s\\S]*",
        regex_to_reversed_partial_regex("a+"));

    assert_equals<std::string>(
-       "^(a*)",
+       "(a*)[\\s\\S]*",
        regex_to_reversed_partial_regex("a*"));

    assert_equals<std::string>(
-       "^(a?)",
+       "(a?)[\\s\\S]*",
        regex_to_reversed_partial_regex("a?"));

    assert_equals<std::string>(
-       "^([a-z])",
+       "([a-z])[\\s\\S]*",
        regex_to_reversed_partial_regex("[a-z]"));

    assert_equals<std::string>(
-       "^((?:\\w+)?[a-z])",
+       "((?:\\w+)?[a-z])[\\s\\S]*",
        regex_to_reversed_partial_regex("[a-z]\\w+"));

    assert_equals<std::string>(
-       "^((?:a|b))",
+       "((?:a|b))[\\s\\S]*",
        regex_to_reversed_partial_regex("(?:a|b)"));

    assert_equals<std::string>(
-       "^((?:(?:(?:d)?c)?b)?a)",
+       "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("abcd"));

    assert_equals<std::string>(
-       "^((?:b)?a*)", // TODO: ((?:b)?a*+).* ??
+       "((?:b)?a*)[\\s\\S]*", // TODO: ((?:b)?a*+).* ??
        regex_to_reversed_partial_regex("a*b"));

    assert_equals<std::string>(
-       "^((?:(?:b)?a)?.*)",
+       "((?:(?:b)?a)?.*)[\\s\\S]*",
        regex_to_reversed_partial_regex(".*?ab"));

    assert_equals<std::string>(
-       "^((?:(?:b)?.*)?a)",
+       "((?:(?:b)?.*)?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("a.*?b"));

    assert_equals<std::string>(
-       "^((?:(?:d)?(?:(?:c)?b))?a)",
+       "((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("a(bc)d"));

    assert_equals<std::string>(
-       "^((?:(?:(?:c)?b|(?:e)?d))?a)",
+       "((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("a(bc|de)"));

    assert_equals<std::string>(
-       "^((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)",
+       "((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",
        regex_to_reversed_partial_regex("ab{2,4}c"));
}
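Both expected forms in this hunk accept the same partial matches of the reversed pattern; they differ only in anchoring. The `-` form pins the group to the start with `^(...)` and relies on the caller using a search, while the `+` form appends `[\s\S]*` so a full match succeeds against the rest of the input. A small illustration with `std::regex` (a sketch, not the project's matching code):

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
    // "...ab" reversed: the text begins with a partial match of "abc" reversed.
    const std::string reversed = "ba...";

    const std::regex anchored("^((?:(?:c)?b)?a)");          // the "-" form
    const std::regex trailing("((?:(?:c)?b)?a)[\\s\\S]*");  // the "+" form

    assert(std::regex_search(reversed, anchored)); // ^ pins the match to the start
    assert(std::regex_match(reversed, trailing));  // [\s\S]* consumes the remainder
    return 0;
}
```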

View File

@ -27,7 +27,6 @@ add_library(mtmd
    models/qwen3vl.cpp
    models/siglip.cpp
    models/whisper-enc.cpp
-   models/youtuvl.cpp
    )

set_target_properties(mtmd PROPERTIES

View File

@ -45,14 +45,13 @@
#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
-#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"

// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@ -181,7 +180,6 @@ enum projector_type {
    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
-   PROJECTOR_TYPE_MUSIC_FLAMINGO,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
@ -189,7 +187,6 @@ enum projector_type {
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
-   PROJECTOR_TYPE_YOUTUVL,
    PROJECTOR_TYPE_UNKNOWN,
};
@ -212,7 +209,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
-   { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
@ -220,7 +216,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
-   { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@ -61,7 +61,6 @@ struct clip_hparams {
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
-   std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)

    // audio
    int32_t n_mel_bins = 0; // whisper preprocessor
@ -320,8 +319,7 @@ struct clip_model {
    bool audio_has_avgpool() const {
        return proj_type == PROJECTOR_TYPE_QWEN2A
-           || proj_type == PROJECTOR_TYPE_VOXTRAL
-           || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+           || proj_type == PROJECTOR_TYPE_VOXTRAL;
    }

    bool audio_has_stack_frames() const {

View File

@ -818,7 +818,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
-       case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
@ -846,10 +845,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
-       case PROJECTOR_TYPE_YOUTUVL:
-           {
-               builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-           } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@ -1163,20 +1158,6 @@ struct clip_model_loader {
                        LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                    }
                } break;
-           case PROJECTOR_TYPE_YOUTUVL:
-               {
-                   hparams.n_merge = 2;
-                   get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                   get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                   std::vector<int> wa_layer_indexes_vec;
-                   get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                   for (auto & layer : wa_layer_indexes_vec) {
-                       hparams.wa_layer_indexes.insert(layer);
-                   }
-                   // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                   hparams.set_limit_image_tokens(1, 62500);
-                   hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-               } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    hparams.rope_theta = 10000.0f;
@ -1195,7 +1176,6 @@ struct clip_model_loader {
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_GLMA:
            case PROJECTOR_TYPE_VOXTRAL:
-           case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                {
                    bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                         model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@ -1245,14 +1225,7 @@ struct clip_model_loader {
            LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
            LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
            LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
            LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-           if (!hparams.wa_layer_indexes.empty()) {
-               LOG_INF("%s: wa_layer_indexes: ", __func__);
-               for (auto & layer : hparams.wa_layer_indexes) {
-                   LOG_INF("%d ", layer);
-               }
-               LOG_INF("\n");
-           }
            if (hparams.image_min_pixels > 0) {
                LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
            }
@ -1520,14 +1493,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
-           case PROJECTOR_TYPE_YOUTUVL:
-               {
-                   model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
-                   model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
-                   model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                   model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
-                   model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-               } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
@ -1552,14 +1517,6 @@ struct clip_model_loader {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
            case PROJECTOR_TYPE_LFM2:
-               {
-                   model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                   model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
-                   model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                   model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
-                   model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                   model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-               } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@ -1619,17 +1576,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                } break;
-           case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-               {
-                   model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                   model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                   model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                   model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                   model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                   model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                   model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                   model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-               } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@ -2738,57 +2684,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
-       case PROJECTOR_TYPE_YOUTUVL:
-           {
-               const int patch_size = params.patch_size; // typically 16
-               const int merge_size = params.n_merge;    // typically 2
-               const int align_size = patch_size * merge_size; // 32
-               const int max_num_patches = params.image_max_pixels > 0 ?
-                   params.image_max_pixels / (patch_size * patch_size) : 256;
-               // Linear search for optimal scale to fit within max_num_patches
-               float scale = 1.0f;
-               int target_height = original_size.height;
-               int target_width  = original_size.width;
-               auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                   float scaled_size = size * scale;
-                   // Round up to nearest multiple of align_size
-                   int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                   // Ensure at least one patch
-                   return std::max(align_size, aligned);
-               };
-               // Linear search with 0.02 step size
-               while (scale > 0.0f) {
-                   target_height = get_scaled_image_size(scale, original_size.height);
-                   target_width  = get_scaled_image_size(scale, original_size.width);
-                   int num_patches_h = target_height / patch_size;
-                   int num_patches_w = target_width / patch_size;
-                   int num_patches = num_patches_h * num_patches_w;
-                   if (num_patches > max_num_patches) {
-                       scale -= 0.02f;
-                   } else {
-                       break;
-                   }
-               }
-               clip_image_size new_size = {target_width, target_height};
-               // Resize the image
-               clip_image_u8 resized;
-               img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-               // Normalize to float32
-               clip_image_f32_ptr img_f32(clip_image_f32_init());
-               normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-               // Add to results
-               res_imgs->entries.push_back(std::move(img_f32));
-           } break;
        case PROJECTOR_TYPE_IDEFICS3:
            {
@ -3021,7 +2916,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-       case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@ -3037,7 +2931,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-       case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@ -3098,7 +2991,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-       case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@ -3139,7 +3031,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
-       case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;
@ -3226,6 +3117,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width  / patch_size;
    const int pos_h = image_size_height / patch_size;

+   const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@ -3374,11 +3266,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
-       case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
-               const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw = image_size_width  / patch_size / merge_ratio;
                const int ph = image_size_height / patch_size / merge_ratio;
@ -3389,7 +3279,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
-                   const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                   const int attn_window_size = 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@ -3513,7 +3403,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
-       case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@ -3627,7 +3516,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
-       case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
@ -3638,7 +3526,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
-       case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@ -3700,8 +3587,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-       || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
-       || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+       || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
}

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
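The removed YoutuVL preprocessing branch above sizes the image by linear search: starting at scale 1.0 and stepping down by 0.02, it rounds each candidate size up to a multiple of `patch_size * merge_size` and stops once the resulting patch grid fits the token budget. A standalone sketch of that loop (plain types standing in for the clip structs):

```cpp
#include <algorithm>
#include <cmath>

struct img_size { int width; int height; };

// Round size * scale up to a multiple of `align`, but never below one block.
static int aligned_size(float scale, int size, int align) {
    const int up = static_cast<int>(std::ceil(size * scale / align)) * align;
    return std::max(align, up);
}

// Shrink in 0.02 steps until (w/patch) * (h/patch) fits the patch budget.
static img_size fit_to_patch_budget(img_size in, int patch_size, int merge_size,
                                    int max_num_patches) {
    const int align = patch_size * merge_size; // e.g. 16 * 2 = 32
    img_size out = in;
    for (float scale = 1.0f; scale > 0.0f; scale -= 0.02f) {
        out.height = aligned_size(scale, in.height, align);
        out.width  = aligned_size(scale, in.width,  align);
        if ((out.height / patch_size) * (out.width / patch_size) <= max_num_patches) {
            break;
        }
    }
    return out;
}
```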

View File

@ -27,11 +27,6 @@ struct clip_graph_qwen3vl : clip_graph {
    ggml_cgraph * build() override;
};

-struct clip_graph_youtuvl : clip_graph {
-    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};

struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;

View File

@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

-       // projection, in LFM2-VL input norm is optional
-       if (model.mm_input_norm_w) {
-           cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-           cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-       }
-       if (model.mm_input_norm_b) {
-           cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-       }
+       // projection
+       cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+       cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+       cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,

View File

@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
            FFN_GELU_ERF,
            -1);

-   } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-       // projector
-       cur = build_ffn(cur,
-           model.mm_1_w, model.mm_1_b,
-           nullptr, nullptr,
-           model.mm_2_w, model.mm_2_b,
-           FFN_GELU_ERF,
-           -1);

    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
        cur = ggml_norm(ctx0, cur, hparams.eps);
        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View File

@ -1,179 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// change conv3d to linear
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}

View File

@ -283,7 +283,7 @@ struct mtmd_context {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

-       } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+       } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";
@ -330,7 +330,6 @@ struct mtmd_context {
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
            case PROJECTOR_TYPE_GLMA:
-           case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
            case PROJECTOR_TYPE_LFM2A:
@ -353,9 +352,6 @@ struct mtmd_context {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";

-       } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-           // <sound> ... (embeddings) ...
-           aud_beg = "<sound>";
        }
    }

View File

@ -12,7 +12,6 @@
#include <cmath>
#include <cctype>
#include <algorithm>
-#include <filesystem>

struct quant_option {
    std::string name;
@ -644,11 +643,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-   if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
-       fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
-       return 1;
-   }

    print_build_info();

    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
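The removed guard above uses `std::filesystem::equivalent` to refuse quantizing a model onto itself; the non-throwing overload that takes an `std::error_code` simply returns false when either path cannot be resolved. A minimal sketch of the same check, outside the quantize tool:

```cpp
#include <cstdio>
#include <filesystem>
#include <string>

// Returns true when both paths resolve to the same underlying file.
// The error_code overload reports false instead of throwing on bad paths.
static bool same_file(const std::string & in, const std::string & out) {
    std::error_code ec;
    return std::filesystem::equivalent(in, out, ec);
}

int main(int argc, char ** argv) {
    if (argc == 3 && same_file(argv[1], argv[2])) {
        std::fprintf(stderr, "error: input and output files are the same: '%s'\n", argv[1]);
        return 1;
    }
    return 0;
}
```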

Binary file not shown.

View File

@ -1385,21 +1385,16 @@ json format_response_rerank(
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
    std::vector<llama_token_data> cur;
    const auto * logits = llama_get_logits_ith(ctx, idx);
-   const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
-   const int n_logits = llama_get_sampled_logits_count_ith(ctx, idx);
+   const llama_model * model = llama_get_model(ctx);
+   const llama_vocab * vocab = llama_model_get_vocab(model);
+   const int n_vocab = llama_vocab_n_tokens(vocab);

-   cur.resize(n_logits);
-   if (sampled_ids) {
-       for (int i = 0; i < n_logits; i++) {
-           cur[i] = llama_token_data{sampled_ids[i], logits[i], 0.0f};
-       }
-   } else {
-       for (llama_token token_id = 0; token_id < n_logits; token_id++) {
-           cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-       }
-   }
+   cur.resize(n_vocab);
+   for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+       cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+   }

    // sort tokens by logits
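Both versions of `get_token_probabilities` end at the step hinted at by the trailing comment: order the candidates by logit, presumably followed by a softmax to fill the probability field. A sketch of that likely follow-up (plain stand-in types; the real function operates on `llama_token_data`):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

struct token_data { int id; float logit; float p; };

// Sort candidates by logit (descending), then softmax the logits into p.
// Subtracting the max logit keeps exp() numerically stable.
static void sort_and_softmax(std::vector<token_data> & cur) {
    std::sort(cur.begin(), cur.end(),
              [](const token_data & a, const token_data & b) { return a.logit > b.logit; });

    if (cur.empty()) {
        return;
    }
    const float max_logit = cur.front().logit;
    double sum = 0.0;
    for (auto & t : cur) {
        t.p = std::exp(t.logit - max_logit);
        sum += t.p;
    }
    for (auto & t : cur) {
        t.p = static_cast<float>(t.p / sum);
    }
}
```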

View File

@ -1148,25 +1148,6 @@ private:
            return false;
        }

-       const bool need_logits = task.params.sampling.n_probs > 0;

-       bool backend_sampling = true;
-       backend_sampling &= task.params.sampling.backend_sampling;
-       // TODO: speculative decoding requires multiple samples per batch - not supported yet
-       backend_sampling &= !(slot.ctx_dft && task.params.speculative.n_max > 0);
-       // TODO: getting post/pre sampling logits is not yet supported with backend sampling
-       backend_sampling &= !need_logits;

-       // TODO: tmp until backend sampling is fully implemented
-       if (backend_sampling) {
-           llama_set_sampler(ctx, slot.id, common_sampler_get(slot.smpl.get()));
-       } else {
-           llama_set_sampler(ctx, slot.id, nullptr);
-       }

        SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
    }

View File

@ -78,7 +78,6 @@ json task_params::to_json(bool only_metrics) const {
            {"speculative.p_min",   speculative.p_min},
            {"timings_per_token",   timings_per_token},
            {"post_sampling_probs", post_sampling_probs},
-           {"backend_sampling",    sampling.backend_sampling},
            {"lora",                lora},
        };
    }
@ -137,7 +136,6 @@ json task_params::to_json(bool only_metrics) const {
        {"speculative.p_min",   speculative.p_min},
        {"timings_per_token",   timings_per_token},
        {"post_sampling_probs", post_sampling_probs},
-       {"backend_sampling",    sampling.backend_sampling},
        {"lora",                lora},
    };
}
@ -206,7 +204,6 @@ task_params server_task::params_from_json_cmpl(
    params.sampling.seed     = json_value(data, "seed",     defaults.sampling.seed);
    params.sampling.n_probs  = json_value(data, "n_probs",  defaults.sampling.n_probs);
    params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
-   params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
    params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);

    params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);

View File

@ -185,11 +185,6 @@
                key: 'samplers',
                label: 'Samplers',
                type: 'input'
-           },
-           {
-               key: 'backend_sampling',
-               label: 'Backend sampling',
-               type: 'checkbox'
            }
        ]
    },

View File

@ -21,7 +21,6 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
    autoMicOnEmpty: false,
    // make sure these default values are in sync with `common.h`
    samplers: 'top_k;typ_p;top_p;min_p;temperature',
-   backend_sampling: false,
    temperature: 0.8,
    dynatemp_range: 0.0,
    dynatemp_exponent: 1.0,
@ -58,8 +57,6 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
        'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
    samplers:
        'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
-   backend_sampling:
-       'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
    temperature:
        'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
    dynatemp_range:

Some files were not shown because too many files have changed in this diff